Skip to content

Commit 7af5974

Browse files
fdmananakdave
authored andcommitted
btrfs: make full fsyncs always operate on the entire file again
This is a revert of commit 0a8068a ("btrfs: make ranged full fsyncs more efficient"), with updated comment in btrfs_sync_file. Commit 0a8068a ("btrfs: make ranged full fsyncs more efficient") made full fsyncs operate on the given range only as it assumed it was safe when using the NO_HOLES feature, since the hole detection was simplified some time ago and no longer was a source for races with ordered extent completion of adjacent file ranges. However it's still not safe to have a full fsync only operate on the given range, because extent maps for new extents might not be present in memory due to inode eviction or extent cloning. Consider the following example: 1) We are currently at transaction N; 2) We write to the file range [0, 1MiB); 3) Writeback finishes for the whole range and ordered extents complete, while we are still at transaction N; 4) The inode is evicted; 5) We open the file for writing, causing the inode to be loaded to memory again, which sets the 'full sync' bit on its flags. At this point the inode's list of modified extent maps is empty (figuring out which extents were created in the current transaction and were not yet logged by an fsync is expensive, that's why we set the 'full sync' bit when loading an inode); 6) We write to the file range [512KiB, 768KiB); 7) We do a ranged fsync (such as msync()) for file range [512KiB, 768KiB). This correctly flushes this range and logs its extent into the log tree. When the writeback started an extent map for range [512KiB, 768KiB) was added to the inode's list of modified extents, and when the fsync() finishes logging it removes that extent map from the list of modified extent maps. This fsync also clears the 'full sync' bit; 8) We do a regular fsync() (full ranged). This fsync() ends up doing nothing because the inode's list of modified extents is empty and no other changes happened since the previous ranged fsync(), so it just returns success (0) and we end up never logging extents for the file ranges [0, 512KiB) and [768KiB, 1MiB). Another scenario where this can happen is if we replace steps 2 to 4 with cloning from another file into our test file, as that sets the 'full sync' bit in our inode's flags and does not populate its list of modified extent maps. This was causing test case generic/457 to fail sporadically when using the NO_HOLES feature, as it exercised this later case where the inode has the 'full sync' bit set and has no extent maps in memory to represent the new extents due to extent cloning. Fix this by reverting commit 0a8068a ("btrfs: make ranged full fsyncs more efficient") since there is no easy way to work around it. Fixes: 0a8068a ("btrfs: make ranged full fsyncs more efficient") Signed-off-by: Filipe Manana <[email protected]> Signed-off-by: David Sterba <[email protected]>
1 parent 4fdb688 commit 7af5974

File tree

2 files changed

+29
-79
lines changed

2 files changed

+29
-79
lines changed

fs/btrfs/file.c

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2097,6 +2097,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
20972097

20982098
atomic_inc(&root->log_batch);
20992099

2100+
/*
2101+
* If the inode needs a full sync, make sure we use a full range to
2102+
* avoid log tree corruption, due to hole detection racing with ordered
2103+
* extent completion for adjacent ranges and races between logging and
2104+
* completion of ordered extents for adjancent ranges - both races
2105+
* could lead to file extent items in the log with overlapping ranges.
2106+
* Do this while holding the inode lock, to avoid races with other
2107+
* tasks.
2108+
*/
2109+
if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2110+
&BTRFS_I(inode)->runtime_flags)) {
2111+
start = 0;
2112+
end = LLONG_MAX;
2113+
}
2114+
21002115
/*
21012116
* Before we acquired the inode's lock, someone may have dirtied more
21022117
* pages in the target range. We need to make sure that writeback for

fs/btrfs/tree-log.c

Lines changed: 14 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,8 @@ enum {
9696
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
9797
struct btrfs_root *root, struct btrfs_inode *inode,
9898
int inode_only,
99-
u64 start,
100-
u64 end,
99+
const loff_t start,
100+
const loff_t end,
101101
struct btrfs_log_ctx *ctx);
102102
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
103103
struct btrfs_root *root,
@@ -4533,37 +4533,28 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
45334533
static int btrfs_log_holes(struct btrfs_trans_handle *trans,
45344534
struct btrfs_root *root,
45354535
struct btrfs_inode *inode,
4536-
struct btrfs_path *path,
4537-
const u64 start,
4538-
const u64 end)
4536+
struct btrfs_path *path)
45394537
{
45404538
struct btrfs_fs_info *fs_info = root->fs_info;
45414539
struct btrfs_key key;
45424540
const u64 ino = btrfs_ino(inode);
45434541
const u64 i_size = i_size_read(&inode->vfs_inode);
4544-
u64 prev_extent_end = start;
4542+
u64 prev_extent_end = 0;
45454543
int ret;
45464544

45474545
if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
45484546
return 0;
45494547

45504548
key.objectid = ino;
45514549
key.type = BTRFS_EXTENT_DATA_KEY;
4552-
key.offset = start;
4550+
key.offset = 0;
45534551

45544552
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
45554553
if (ret < 0)
45564554
return ret;
45574555

4558-
if (ret > 0 && path->slots[0] > 0) {
4559-
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
4560-
if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
4561-
path->slots[0]--;
4562-
}
4563-
45644556
while (true) {
45654557
struct extent_buffer *leaf = path->nodes[0];
4566-
u64 extent_end;
45674558

45684559
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
45694560
ret = btrfs_next_leaf(root, path);
@@ -4580,18 +4571,9 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
45804571
if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
45814572
break;
45824573

4583-
extent_end = btrfs_file_extent_end(path);
4584-
if (extent_end <= start)
4585-
goto next_slot;
4586-
45874574
/* We have a hole, log it. */
45884575
if (prev_extent_end < key.offset) {
4589-
u64 hole_len;
4590-
4591-
if (key.offset >= end)
4592-
hole_len = end - prev_extent_end;
4593-
else
4594-
hole_len = key.offset - prev_extent_end;
4576+
const u64 hole_len = key.offset - prev_extent_end;
45954577

45964578
/*
45974579
* Release the path to avoid deadlocks with other code
@@ -4621,20 +4603,16 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
46214603
leaf = path->nodes[0];
46224604
}
46234605

4624-
prev_extent_end = min(extent_end, end);
4625-
if (extent_end >= end)
4626-
break;
4627-
next_slot:
4606+
prev_extent_end = btrfs_file_extent_end(path);
46284607
path->slots[0]++;
46294608
cond_resched();
46304609
}
46314610

4632-
if (prev_extent_end < end && prev_extent_end < i_size) {
4611+
if (prev_extent_end < i_size) {
46334612
u64 hole_len;
46344613

46354614
btrfs_release_path(path);
4636-
hole_len = min(ALIGN(i_size, fs_info->sectorsize), end);
4637-
hole_len -= prev_extent_end;
4615+
hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
46384616
ret = btrfs_insert_file_extent(trans, root->log_root,
46394617
ino, prev_extent_end, 0, 0,
46404618
hole_len, 0, hole_len,
@@ -4971,8 +4949,6 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
49714949
const u64 logged_isize,
49724950
const bool recursive_logging,
49734951
const int inode_only,
4974-
const u64 start,
4975-
const u64 end,
49764952
struct btrfs_log_ctx *ctx,
49774953
bool *need_log_inode_item)
49784954
{
@@ -4981,21 +4957,6 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
49814957
int ins_nr = 0;
49824958
int ret;
49834959

4984-
/*
4985-
* We must make sure we don't copy extent items that are entirely out of
4986-
* the range [start, end - 1]. This is not just an optimization to avoid
4987-
* copying but also needed to avoid a corruption where we end up with
4988-
* file extent items in the log tree that have overlapping ranges - this
4989-
* can happen if we race with ordered extent completion for ranges that
4990-
* are outside our target range. For example we copy an extent item and
4991-
* when we move to the next leaf, that extent was trimmed and a new one
4992-
* covering a subrange of it, but with a higher key, was inserted - we
4993-
* would then copy this other extent too, resulting in a log tree with
4994-
* 2 extent items that represent overlapping ranges.
4995-
*
4996-
* We can copy the entire extents at the range bondaries however, even
4997-
* if they cover an area outside the target range. That's ok.
4998-
*/
49994960
while (1) {
50004961
ret = btrfs_search_forward(root, min_key, path, trans->transid);
50014962
if (ret < 0)
@@ -5063,29 +5024,6 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
50635024
goto next_slot;
50645025
}
50655026

5066-
if (min_key->type == BTRFS_EXTENT_DATA_KEY) {
5067-
const u64 extent_end = btrfs_file_extent_end(path);
5068-
5069-
if (extent_end <= start) {
5070-
if (ins_nr > 0) {
5071-
ret = copy_items(trans, inode, dst_path,
5072-
path, ins_start_slot,
5073-
ins_nr, inode_only,
5074-
logged_isize);
5075-
if (ret < 0)
5076-
return ret;
5077-
ins_nr = 0;
5078-
}
5079-
goto next_slot;
5080-
}
5081-
if (extent_end >= end) {
5082-
ins_nr++;
5083-
if (ins_nr == 1)
5084-
ins_start_slot = path->slots[0];
5085-
break;
5086-
}
5087-
}
5088-
50895027
if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
50905028
ins_nr++;
50915029
goto next_slot;
@@ -5151,8 +5089,8 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
51515089
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
51525090
struct btrfs_root *root, struct btrfs_inode *inode,
51535091
int inode_only,
5154-
u64 start,
5155-
u64 end,
5092+
const loff_t start,
5093+
const loff_t end,
51565094
struct btrfs_log_ctx *ctx)
51575095
{
51585096
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -5180,9 +5118,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
51805118
return -ENOMEM;
51815119
}
51825120

5183-
start = ALIGN_DOWN(start, fs_info->sectorsize);
5184-
end = ALIGN(end, fs_info->sectorsize);
5185-
51865121
min_key.objectid = ino;
51875122
min_key.type = BTRFS_INODE_ITEM_KEY;
51885123
min_key.offset = 0;
@@ -5298,8 +5233,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
52985233

52995234
err = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
53005235
path, dst_path, logged_isize,
5301-
recursive_logging, inode_only,
5302-
start, end, ctx, &need_log_inode_item);
5236+
recursive_logging, inode_only, ctx,
5237+
&need_log_inode_item);
53035238
if (err)
53045239
goto out_unlock;
53055240

@@ -5312,7 +5247,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
53125247
if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
53135248
btrfs_release_path(path);
53145249
btrfs_release_path(dst_path);
5315-
err = btrfs_log_holes(trans, root, inode, path, start, end);
5250+
err = btrfs_log_holes(trans, root, inode, path);
53165251
if (err)
53175252
goto out_unlock;
53185253
}

0 commit comments

Comments
 (0)