Skip to content

Commit 81bd932

Browse files
Christoph Hellwigkdave
authored andcommitted
btrfs: fix repair of compressed extents
Currently the checksum of compressed extents is verified based on the compressed data and the lower btrfs_bio, but the actual repair process is driven by end_bio_extent_readpage on the upper btrfs_bio for the decompressed data. This has a bunch of issues, including not being able to properly communicate the failed mirror up in case that the I/O submission got preempted, a general loss of if an error was an I/O error or a checksum verification failure, but most importantly that this design causes btrfs_clean_io_failure to eventually write back the uncompressed good data onto the disk sectors that are supposed to contain compressed data. Fix this by moving the repair to the lower btrfs_bio. To do so, a fair amount of code has to be reshuffled: a) the lower btrfs_bio now needs a valid csum pointer. The easiest way to achieve that is to pass NULL btrfs_lookup_bio_sums and just use the btrfs_bio management of csums. For a compressed_bio that is split into multiple btrfs_bios this means additional memory allocations, but the code becomes a lot more regular. b) checksum verification now runs directly on the lower btrfs_bio instead of the compressed_bio. This actually nicely simplifies the end I/O processing. c) btrfs_repair_one_sector can't just look up the logical address for the file offset any more, as there is no corresponding relative offsets that apply to the file offset and the logic address for compressed extents. Instead require that the saved bvec_iter in the btrfs_bio is filled out for all read bios and use that, which again removes a fair amount of code. Reviewed-by: Nikolay Borisov <[email protected]> Signed-off-by: Christoph Hellwig <[email protected]> Signed-off-by: David Sterba <[email protected]>
1 parent 7959bd4 commit 81bd932

File tree

6 files changed

+76
-157
lines changed

6 files changed

+76
-157
lines changed

fs/btrfs/compression.c

Lines changed: 58 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -136,66 +136,14 @@ static int compression_decompress(int type, struct list_head *ws,
136136

137137
static int btrfs_decompress_bio(struct compressed_bio *cb);
138138

139-
static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
140-
unsigned long disk_size)
141-
{
142-
return sizeof(struct compressed_bio) +
143-
(DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * fs_info->csum_size;
144-
}
145-
146-
static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
147-
u64 disk_start)
148-
{
149-
struct btrfs_fs_info *fs_info = inode->root->fs_info;
150-
const u32 csum_size = fs_info->csum_size;
151-
const u32 sectorsize = fs_info->sectorsize;
152-
struct page *page;
153-
unsigned int i;
154-
u8 csum[BTRFS_CSUM_SIZE];
155-
struct compressed_bio *cb = bio->bi_private;
156-
u8 *cb_sum = cb->sums;
157-
158-
if ((inode->flags & BTRFS_INODE_NODATASUM) ||
159-
test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
160-
return 0;
161-
162-
for (i = 0; i < cb->nr_pages; i++) {
163-
u32 pg_offset;
164-
u32 bytes_left = PAGE_SIZE;
165-
page = cb->compressed_pages[i];
166-
167-
/* Determine the remaining bytes inside the page first */
168-
if (i == cb->nr_pages - 1)
169-
bytes_left = cb->compressed_len - i * PAGE_SIZE;
170-
171-
/* Hash through the page sector by sector */
172-
for (pg_offset = 0; pg_offset < bytes_left;
173-
pg_offset += sectorsize) {
174-
int ret;
175-
176-
ret = btrfs_check_sector_csum(fs_info, page, pg_offset,
177-
csum, cb_sum);
178-
if (ret) {
179-
btrfs_print_data_csum_error(inode, disk_start,
180-
csum, cb_sum, cb->mirror_num);
181-
if (btrfs_bio(bio)->device)
182-
btrfs_dev_stat_inc_and_print(
183-
btrfs_bio(bio)->device,
184-
BTRFS_DEV_STAT_CORRUPTION_ERRS);
185-
return -EIO;
186-
}
187-
cb_sum += csum_size;
188-
disk_start += sectorsize;
189-
}
190-
}
191-
return 0;
192-
}
193-
194139
static void finish_compressed_bio_read(struct compressed_bio *cb)
195140
{
196141
unsigned int index;
197142
struct page *page;
198143

144+
if (cb->status == BLK_STS_OK)
145+
cb->status = errno_to_blk_status(btrfs_decompress_bio(cb));
146+
199147
/* Release the compressed pages */
200148
for (index = 0; index < cb->nr_pages; index++) {
201149
page = cb->compressed_pages[index];
@@ -233,59 +181,54 @@ static void finish_compressed_bio_read(struct compressed_bio *cb)
233181
kfree(cb);
234182
}
235183

236-
/* when we finish reading compressed pages from the disk, we
237-
* decompress them and then run the bio end_io routines on the
238-
* decompressed pages (in the inode address space).
239-
*
240-
* This allows the checksumming and other IO error handling routines
241-
* to work normally
242-
*
243-
* The compressed pages are freed here, and it must be run
244-
* in process context
184+
/*
185+
* Verify the checksums and kick off repair if needed on the uncompressed data
186+
* before decompressing it into the original bio and freeing the uncompressed
187+
* pages.
245188
*/
246189
static void end_compressed_bio_read(struct bio *bio)
247190
{
248191
struct compressed_bio *cb = bio->bi_private;
249-
struct inode *inode;
250-
unsigned int mirror = btrfs_bio(bio)->mirror_num;
251-
int ret = 0;
252-
253-
if (bio->bi_status)
254-
cb->status = bio->bi_status;
255-
256-
if (!refcount_dec_and_test(&cb->pending_ios))
257-
goto out;
258-
259-
/*
260-
* Record the correct mirror_num in cb->orig_bio so that
261-
* read-repair can work properly.
262-
*/
263-
btrfs_bio(cb->orig_bio)->mirror_num = mirror;
264-
cb->mirror_num = mirror;
265-
266-
/*
267-
* Some IO in this cb have failed, just skip checksum as there
268-
* is no way it could be correct.
269-
*/
270-
if (cb->status != BLK_STS_OK)
271-
goto csum_failed;
192+
struct inode *inode = cb->inode;
193+
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
194+
struct btrfs_inode *bi = BTRFS_I(inode);
195+
bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) &&
196+
!test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
197+
blk_status_t status = bio->bi_status;
198+
struct btrfs_bio *bbio = btrfs_bio(bio);
199+
struct bvec_iter iter;
200+
struct bio_vec bv;
201+
u32 offset;
202+
203+
btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) {
204+
u64 start = bbio->file_offset + offset;
205+
206+
if (!status &&
207+
(!csum || !btrfs_check_data_csum(inode, bbio, offset,
208+
bv.bv_page, bv.bv_offset))) {
209+
clean_io_failure(fs_info, &bi->io_failure_tree,
210+
&bi->io_tree, start, bv.bv_page,
211+
btrfs_ino(bi), bv.bv_offset);
212+
} else {
213+
int ret;
272214

273-
inode = cb->inode;
274-
ret = check_compressed_csum(BTRFS_I(inode), bio,
275-
bio->bi_iter.bi_sector << 9);
276-
if (ret)
277-
goto csum_failed;
215+
refcount_inc(&cb->pending_ios);
216+
ret = btrfs_repair_one_sector(inode, bbio, offset,
217+
bv.bv_page, bv.bv_offset,
218+
btrfs_submit_data_read_bio);
219+
if (ret) {
220+
refcount_dec(&cb->pending_ios);
221+
status = errno_to_blk_status(ret);
222+
}
223+
}
224+
}
278225

279-
/* ok, we're the last bio for this extent, lets start
280-
* the decompression.
281-
*/
282-
ret = btrfs_decompress_bio(cb);
226+
if (status)
227+
cb->status = status;
283228

284-
csum_failed:
285-
if (ret)
286-
cb->status = errno_to_blk_status(ret);
287-
finish_compressed_bio_read(cb);
288-
out:
229+
if (refcount_dec_and_test(&cb->pending_ios))
230+
finish_compressed_bio_read(cb);
231+
btrfs_bio_free_csum(bbio);
289232
bio_put(bio);
290233
}
291234

@@ -478,15 +421,14 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
478421

479422
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
480423
IS_ALIGNED(len, fs_info->sectorsize));
481-
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
424+
cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS);
482425
if (!cb)
483426
return BLK_STS_RESOURCE;
484427
refcount_set(&cb->pending_ios, 1);
485428
cb->status = BLK_STS_OK;
486429
cb->inode = &inode->vfs_inode;
487430
cb->start = start;
488431
cb->len = len;
489-
cb->mirror_num = 0;
490432
cb->compressed_pages = compressed_pages;
491433
cb->compressed_len = compressed_len;
492434
cb->writeback = writeback;
@@ -755,7 +697,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
755697
blk_status_t ret;
756698
int ret2;
757699
int i;
758-
u8 *sums;
759700

760701
em_tree = &BTRFS_I(inode)->extent_tree;
761702

@@ -773,7 +714,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
773714

774715
ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
775716
compressed_len = em->block_len;
776-
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
717+
cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS);
777718
if (!cb) {
778719
ret = BLK_STS_RESOURCE;
779720
goto out;
@@ -782,8 +723,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
782723
refcount_set(&cb->pending_ios, 1);
783724
cb->status = BLK_STS_OK;
784725
cb->inode = inode;
785-
cb->mirror_num = mirror_num;
786-
sums = cb->sums;
787726

788727
cb->start = em->orig_start;
789728
em_len = em->len;
@@ -866,19 +805,25 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
866805
submit = true;
867806

868807
if (submit) {
869-
unsigned int nr_sectors;
808+
/* Save the original iter for read repair */
809+
if (bio_op(comp_bio) == REQ_OP_READ)
810+
btrfs_bio(comp_bio)->iter = comp_bio->bi_iter;
811+
812+
/*
813+
* Save the initial offset of this chunk, as there
814+
* is no direct correlation between compressed pages and
815+
* the original file offset. The field is only used for
816+
* priting error messages.
817+
*/
818+
btrfs_bio(comp_bio)->file_offset = file_offset;
870819

871-
ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
820+
ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL);
872821
if (ret) {
873822
comp_bio->bi_status = ret;
874823
bio_endio(comp_bio);
875824
break;
876825
}
877826

878-
nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
879-
fs_info->sectorsize);
880-
sums += fs_info->csum_size * nr_sectors;
881-
882827
ASSERT(comp_bio->bi_iter.bi_size);
883828
btrfs_submit_bio(fs_info, comp_bio, mirror_num);
884829
comp_bio = NULL;

fs/btrfs/compression.h

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -59,19 +59,12 @@ struct compressed_bio {
5959

6060
/* IO errors */
6161
blk_status_t status;
62-
int mirror_num;
6362

6463
union {
6564
/* For reads, this is the bio we are copying the data into */
6665
struct bio *orig_bio;
6766
struct work_struct write_end_work;
6867
};
69-
70-
/*
71-
* the start of a variable length array of checksums only
72-
* used by reads
73-
*/
74-
u8 sums[];
7568
};
7669

7770
static inline unsigned int btrfs_compress_type(unsigned int type_level)

fs/btrfs/ctree.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3293,6 +3293,8 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
32933293
int mirror_num, enum btrfs_compression_type compress_type);
32943294
int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
32953295
u32 pgoff, u8 *csum, const u8 * const csum_expected);
3296+
int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
3297+
u32 bio_offset, struct page *page, u32 pgoff);
32963298
unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
32973299
u32 bio_offset, struct page *page,
32983300
u64 start, u64 end);

fs/btrfs/extent_io.c

Lines changed: 9 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -2545,13 +2545,10 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
25452545
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
25462546
u64 start = bbio->file_offset + bio_offset;
25472547
struct io_failure_record *failrec;
2548-
struct extent_map *em;
25492548
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
25502549
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2551-
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
25522550
const u32 sectorsize = fs_info->sectorsize;
25532551
int ret;
2554-
u64 logical;
25552552

25562553
failrec = get_state_failrec(failure_tree, start);
25572554
if (!IS_ERR(failrec)) {
@@ -2576,41 +2573,13 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
25762573
failrec->len = sectorsize;
25772574
failrec->failed_mirror = bbio->mirror_num;
25782575
failrec->this_mirror = bbio->mirror_num;
2579-
failrec->compress_type = BTRFS_COMPRESS_NONE;
2580-
2581-
read_lock(&em_tree->lock);
2582-
em = lookup_extent_mapping(em_tree, start, failrec->len);
2583-
if (!em) {
2584-
read_unlock(&em_tree->lock);
2585-
kfree(failrec);
2586-
return ERR_PTR(-EIO);
2587-
}
2588-
2589-
if (em->start > start || em->start + em->len <= start) {
2590-
free_extent_map(em);
2591-
em = NULL;
2592-
}
2593-
read_unlock(&em_tree->lock);
2594-
if (!em) {
2595-
kfree(failrec);
2596-
return ERR_PTR(-EIO);
2597-
}
2598-
2599-
logical = start - em->start;
2600-
logical = em->block_start + logical;
2601-
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2602-
logical = em->block_start;
2603-
failrec->compress_type = em->compress_type;
2604-
}
2576+
failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset;
26052577

26062578
btrfs_debug(fs_info,
2607-
"Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
2608-
logical, start, failrec->len);
2609-
2610-
failrec->logical = logical;
2611-
free_extent_map(em);
2579+
"new io failure record logical %llu start %llu",
2580+
failrec->logical, start);
26122581

2613-
failrec->num_copies = btrfs_num_copies(fs_info, logical, sectorsize);
2582+
failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize);
26142583
if (failrec->num_copies == 1) {
26152584
/*
26162585
* We only have a single copy of the data, so don't bother with
@@ -2709,7 +2678,7 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
27092678
* will be handled by the endio on the repair_bio, so we can't return an
27102679
* error here.
27112680
*/
2712-
submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->compress_type);
2681+
submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0);
27132682
return BLK_STS_OK;
27142683
}
27152684

@@ -3117,6 +3086,10 @@ static void end_bio_extent_readpage(struct bio *bio)
31173086
* Only try to repair bios that actually made it to a
31183087
* device. If the bio failed to be submitted mirror
31193088
* is 0 and we need to fail it without retrying.
3089+
*
3090+
* This also includes the high level bios for compressed
3091+
* extents - these never make it to a device and repair
3092+
* is already handled on the lower compressed bio.
31203093
*/
31213094
if (mirror > 0)
31223095
repair = true;

fs/btrfs/extent_io.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -261,7 +261,6 @@ struct io_failure_record {
261261
u64 start;
262262
u64 len;
263263
u64 logical;
264-
enum btrfs_compression_type compress_type;
265264
int this_mirror;
266265
int failed_mirror;
267266
int num_copies;

fs/btrfs/inode.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2749,6 +2749,9 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
27492749
return;
27502750
}
27512751

2752+
/* Save the original iter for read repair */
2753+
btrfs_bio(bio)->iter = bio->bi_iter;
2754+
27522755
/*
27532756
* Lookup bio sums does extra checks around whether we need to csum or
27542757
* not, which is why we ignore skip_sum here.
@@ -8060,6 +8063,10 @@ static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
80608063
struct btrfs_dio_private *dip = bio->bi_private;
80618064
blk_status_t ret;
80628065

8066+
/* Save the original iter for read repair */
8067+
if (btrfs_op(bio) == BTRFS_MAP_READ)
8068+
btrfs_bio(bio)->iter = bio->bi_iter;
8069+
80638070
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
80648071
goto map;
80658072

0 commit comments

Comments
 (0)