Skip to content

Commit 7a23390

Browse files
boryaskdave
authored andcommitted
btrfs: fix read corruption due to race with extent map merging
In debugging some corrupt squashfs files, we observed symptoms of corrupt page cache pages but correct on-disk contents. Further investigation revealed that the exact symptom was a correct page followed by an incorrect, duplicate, page. This got us thinking about extent maps. commit ac05ca9 ("Btrfs: fix race between using extent maps and merging them") enforces a reference count on the primary `em` extent_map being merged, as that one gets modified. However, since, commit 3d2ac99 ("btrfs: introduce new members for extent_map") both 'em' and 'merge' get modified, which started modifying 'merge' and thus introduced the same race. We were able to reproduce this by looping the affected squashfs workload in parallel on a bunch of separate btrfs-es while also dropping caches. We are still working on a simple enough reproducer to make into an fstest. The simplest fix is to stop modifying 'merge', which is not essential, as it is dropped immediately after the merge. This behavior is simply a consequence of the order of the two extent maps being important in computing the new values. Modify merge_ondisk_extents to take prev and next by const* and also take a third merged parameter that it puts the results in. Note that this introduces the rather odd behavior of passing 'em' to merge_ondisk_extents as a const * and as a regular ptr. Fixes: 3d2ac99 ("btrfs: introduce new members for extent_map") CC: [email protected] # 6.11+ Reviewed-by: Qu Wenruo <[email protected]> Reviewed-by: Filipe Manana <[email protected]> Signed-off-by: Omar Sandoval <[email protected]> Signed-off-by: Boris Burkov <[email protected]> Signed-off-by: David Sterba <[email protected]>
1 parent f10f59f commit 7a23390

File tree

1 file changed

+16
-15
lines changed

1 file changed

+16
-15
lines changed

fs/btrfs/extent_map.c

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -243,13 +243,19 @@ static bool mergeable_maps(const struct extent_map *prev, const struct extent_ma
243243
/*
244244
* Handle the on-disk data extents merge for @prev and @next.
245245
*
246+
* @prev: left extent to merge
247+
* @next: right extent to merge
248+
* @merged: the extent we will not discard after the merge; updated with new values
249+
*
250+
* After this, one of the two extents is the new merged extent and the other is
251+
* removed from the tree and likely freed. Note that @merged is one of @prev/@next
252+
* so there is const/non-const aliasing occurring here.
253+
*
246254
* Only touches disk_bytenr/disk_num_bytes/offset/ram_bytes.
247255
* For now only uncompressed regular extent can be merged.
248-
*
249-
* @prev and @next will be both updated to point to the new merged range.
250-
* Thus one of them should be removed by the caller.
251256
*/
252-
static void merge_ondisk_extents(struct extent_map *prev, struct extent_map *next)
257+
static void merge_ondisk_extents(const struct extent_map *prev, const struct extent_map *next,
258+
struct extent_map *merged)
253259
{
254260
u64 new_disk_bytenr;
255261
u64 new_disk_num_bytes;
@@ -284,15 +290,10 @@ static void merge_ondisk_extents(struct extent_map *prev, struct extent_map *nex
284290
new_disk_bytenr;
285291
new_offset = prev->disk_bytenr + prev->offset - new_disk_bytenr;
286292

287-
prev->disk_bytenr = new_disk_bytenr;
288-
prev->disk_num_bytes = new_disk_num_bytes;
289-
prev->ram_bytes = new_disk_num_bytes;
290-
prev->offset = new_offset;
291-
292-
next->disk_bytenr = new_disk_bytenr;
293-
next->disk_num_bytes = new_disk_num_bytes;
294-
next->ram_bytes = new_disk_num_bytes;
295-
next->offset = new_offset;
293+
merged->disk_bytenr = new_disk_bytenr;
294+
merged->disk_num_bytes = new_disk_num_bytes;
295+
merged->ram_bytes = new_disk_num_bytes;
296+
merged->offset = new_offset;
296297
}
297298

298299
static void dump_extent_map(struct btrfs_fs_info *fs_info, const char *prefix,
@@ -361,7 +362,7 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
361362
em->generation = max(em->generation, merge->generation);
362363

363364
if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
364-
merge_ondisk_extents(merge, em);
365+
merge_ondisk_extents(merge, em, em);
365366
em->flags |= EXTENT_FLAG_MERGED;
366367

367368
validate_extent_map(fs_info, em);
@@ -378,7 +379,7 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
378379
if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) {
379380
em->len += merge->len;
380381
if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
381-
merge_ondisk_extents(em, merge);
382+
merge_ondisk_extents(em, merge, em);
382383
validate_extent_map(fs_info, em);
383384
rb_erase(&merge->rb_node, &tree->root);
384385
RB_CLEAR_NODE(&merge->rb_node);

0 commit comments

Comments
 (0)