Skip to content

Commit 3eaea0d

Browse files
committed
Merge tag 'for-6.1-rc6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs fixes from David Sterba:

 - fix a regression in nowait + buffered write

 - in zoned mode fix endianness when comparing super block generation

 - locking and lockdep fixes:
     - fix potential sleeping under spinlock when setting qgroup limit
     - lockdep warning fixes when btrfs_path is freed after copy_to_user
     - do not modify log tree while holding a leaf from fs tree locked

 - fix freeing of sysfs files of static features on error

 - use kvcalloc for zone map allocation as a fallback to avoid warnings due to high order allocation

 - send, avoid unaligned encoded writes when attempting to clone range

* tag 'for-6.1-rc6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: sysfs: normalize the error handling branch in btrfs_init_sysfs()
  btrfs: do not modify log tree while holding a leaf from fs tree locked
  btrfs: use kvcalloc in btrfs_get_dev_zone_info
  btrfs: qgroup: fix sleep from invalid context bug in btrfs_qgroup_inherit()
  btrfs: send: avoid unaligned encoded writes when attempting to clone range
  btrfs: zoned: fix missing endianness conversion in sb_write_pointer
  btrfs: free btrfs_path before copying subvol info to userspace
  btrfs: free btrfs_path before copying fspath to userspace
  btrfs: free btrfs_path before copying inodes to userspace
  btrfs: free btrfs_path before copying root refs to userspace
  btrfs: fix assertion failure and blocking during nowait buffered write
2 parents 88817ac + ffdbb44 commit 3eaea0d

File tree

7 files changed

+132
-35
lines changed

7 files changed

+132
-35
lines changed

fs/btrfs/ctree.c

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4663,7 +4663,12 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
46634663
int ret;
46644664
int i;
46654665

4666-
ASSERT(!path->nowait);
4666+
/*
4667+
* The nowait semantics are used only for write paths, where we don't
4668+
* use the tree mod log and sequence numbers.
4669+
*/
4670+
if (time_seq)
4671+
ASSERT(!path->nowait);
46674672

46684673
nritems = btrfs_header_nritems(path->nodes[0]);
46694674
if (nritems == 0)
@@ -4683,7 +4688,14 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
46834688
if (path->need_commit_sem) {
46844689
path->need_commit_sem = 0;
46854690
need_commit_sem = true;
4686-
down_read(&fs_info->commit_root_sem);
4691+
if (path->nowait) {
4692+
if (!down_read_trylock(&fs_info->commit_root_sem)) {
4693+
ret = -EAGAIN;
4694+
goto done;
4695+
}
4696+
} else {
4697+
down_read(&fs_info->commit_root_sem);
4698+
}
46874699
}
46884700
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
46894701
}
@@ -4759,7 +4771,7 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
47594771
next = c;
47604772
ret = read_block_for_search(root, path, &next, level,
47614773
slot, &key);
4762-
if (ret == -EAGAIN)
4774+
if (ret == -EAGAIN && !path->nowait)
47634775
goto again;
47644776

47654777
if (ret < 0) {
@@ -4769,6 +4781,10 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
47694781

47704782
if (!path->skip_locking) {
47714783
ret = btrfs_try_tree_read_lock(next);
4784+
if (!ret && path->nowait) {
4785+
ret = -EAGAIN;
4786+
goto done;
4787+
}
47724788
if (!ret && time_seq) {
47734789
/*
47744790
* If we don't get the lock, we may be racing
@@ -4799,16 +4815,24 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
47994815

48004816
ret = read_block_for_search(root, path, &next, level,
48014817
0, &key);
4802-
if (ret == -EAGAIN)
4818+
if (ret == -EAGAIN && !path->nowait)
48034819
goto again;
48044820

48054821
if (ret < 0) {
48064822
btrfs_release_path(path);
48074823
goto done;
48084824
}
48094825

4810-
if (!path->skip_locking)
4811-
btrfs_tree_read_lock(next);
4826+
if (!path->skip_locking) {
4827+
if (path->nowait) {
4828+
if (!btrfs_try_tree_read_lock(next)) {
4829+
ret = -EAGAIN;
4830+
goto done;
4831+
}
4832+
} else {
4833+
btrfs_tree_read_lock(next);
4834+
}
4835+
}
48124836
}
48134837
ret = 0;
48144838
done:

fs/btrfs/ioctl.c

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3105,6 +3105,8 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
31053105
}
31063106
}
31073107

3108+
btrfs_free_path(path);
3109+
path = NULL;
31083110
if (copy_to_user(argp, subvol_info, sizeof(*subvol_info)))
31093111
ret = -EFAULT;
31103112

@@ -3194,6 +3196,8 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
31943196
}
31953197

31963198
out:
3199+
btrfs_free_path(path);
3200+
31973201
if (!ret || ret == -EOVERFLOW) {
31983202
rootrefs->num_items = found;
31993203
/* update min_treeid for next search */
@@ -3205,7 +3209,6 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
32053209
}
32063210

32073211
kfree(rootrefs);
3208-
btrfs_free_path(path);
32093212

32103213
return ret;
32113214
}
@@ -4231,6 +4234,8 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
42314234
ipath->fspath->val[i] = rel_ptr;
42324235
}
42334236

4237+
btrfs_free_path(path);
4238+
path = NULL;
42344239
ret = copy_to_user((void __user *)(unsigned long)ipa->fspath,
42354240
ipath->fspath, size);
42364241
if (ret) {
@@ -4281,21 +4286,20 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
42814286
size = min_t(u32, loi->size, SZ_16M);
42824287
}
42834288

4284-
path = btrfs_alloc_path();
4285-
if (!path) {
4286-
ret = -ENOMEM;
4287-
goto out;
4288-
}
4289-
42904289
inodes = init_data_container(size);
42914290
if (IS_ERR(inodes)) {
42924291
ret = PTR_ERR(inodes);
4293-
inodes = NULL;
4294-
goto out;
4292+
goto out_loi;
42954293
}
42964294

4295+
path = btrfs_alloc_path();
4296+
if (!path) {
4297+
ret = -ENOMEM;
4298+
goto out;
4299+
}
42974300
ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
42984301
inodes, ignore_offset);
4302+
btrfs_free_path(path);
42994303
if (ret == -EINVAL)
43004304
ret = -ENOENT;
43014305
if (ret < 0)
@@ -4307,7 +4311,6 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
43074311
ret = -EFAULT;
43084312

43094313
out:
4310-
btrfs_free_path(path);
43114314
kvfree(inodes);
43124315
out_loi:
43134316
kfree(loi);

fs/btrfs/qgroup.c

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2951,14 +2951,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
29512951
dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
29522952
dstgroup->rsv_excl = inherit->lim.rsv_excl;
29532953

2954-
ret = update_qgroup_limit_item(trans, dstgroup);
2955-
if (ret) {
2956-
qgroup_mark_inconsistent(fs_info);
2957-
btrfs_info(fs_info,
2958-
"unable to update quota limit for %llu",
2959-
dstgroup->qgroupid);
2960-
goto unlock;
2961-
}
2954+
qgroup_dirty(fs_info, dstgroup);
29622955
}
29632956

29642957
if (srcid) {

fs/btrfs/send.c

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5702,6 +5702,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
57025702
u64 ext_len;
57035703
u64 clone_len;
57045704
u64 clone_data_offset;
5705+
bool crossed_src_i_size = false;
57055706

57065707
if (slot >= btrfs_header_nritems(leaf)) {
57075708
ret = btrfs_next_leaf(clone_root->root, path);
@@ -5759,8 +5760,10 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
57595760
if (key.offset >= clone_src_i_size)
57605761
break;
57615762

5762-
if (key.offset + ext_len > clone_src_i_size)
5763+
if (key.offset + ext_len > clone_src_i_size) {
57635764
ext_len = clone_src_i_size - key.offset;
5765+
crossed_src_i_size = true;
5766+
}
57645767

57655768
clone_data_offset = btrfs_file_extent_offset(leaf, ei);
57665769
if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) {
@@ -5821,6 +5824,25 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
58215824
ret = send_clone(sctx, offset, clone_len,
58225825
clone_root);
58235826
}
5827+
} else if (crossed_src_i_size && clone_len < len) {
5828+
/*
5829+
* If we are at i_size of the clone source inode and we
5830+
* can not clone from it, terminate the loop. This is
5831+
* to avoid sending two write operations, one with a
5832+
* length matching clone_len and the final one after
5833+
* this loop with a length of len - clone_len.
5834+
*
5835+
* When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED
5836+
* was passed to the send ioctl), this helps avoid
5837+
* sending an encoded write for an offset that is not
5838+
* sector size aligned, in case the i_size of the source
5839+
* inode is not sector size aligned. That will make the
5840+
* receiver fall back to decompression of the data and
5841+
* writing it using regular buffered IO, therefore while
5842+
* not incorrect, it's not optimal due to decompression and
5843+
* possible re-compression at the receiver.
5844+
*/
5845+
break;
58245846
} else {
58255847
ret = send_extent_data(sctx, dst_path, offset,
58265848
clone_len);

fs/btrfs/sysfs.c

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2321,8 +2321,11 @@ int __init btrfs_init_sysfs(void)
23212321

23222322
#ifdef CONFIG_BTRFS_DEBUG
23232323
ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group);
2324-
if (ret)
2325-
goto out2;
2324+
if (ret) {
2325+
sysfs_unmerge_group(&btrfs_kset->kobj,
2326+
&btrfs_static_feature_attr_group);
2327+
goto out_remove_group;
2328+
}
23262329
#endif
23272330

23282331
return 0;

fs/btrfs/tree-log.c

Lines changed: 55 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3694,15 +3694,29 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
36943694
u64 *last_old_dentry_offset)
36953695
{
36963696
struct btrfs_root *log = inode->root->log_root;
3697-
struct extent_buffer *src = path->nodes[0];
3698-
const int nritems = btrfs_header_nritems(src);
3697+
struct extent_buffer *src;
3698+
const int nritems = btrfs_header_nritems(path->nodes[0]);
36993699
const u64 ino = btrfs_ino(inode);
37003700
bool last_found = false;
37013701
int batch_start = 0;
37023702
int batch_size = 0;
37033703
int i;
37043704

3705-
for (i = path->slots[0]; i < nritems; i++) {
3705+
/*
3706+
* We need to clone the leaf, release the read lock on it, and use the
3707+
* clone before modifying the log tree. See the comment at copy_items()
3708+
* about why we need to do this.
3709+
*/
3710+
src = btrfs_clone_extent_buffer(path->nodes[0]);
3711+
if (!src)
3712+
return -ENOMEM;
3713+
3714+
i = path->slots[0];
3715+
btrfs_release_path(path);
3716+
path->nodes[0] = src;
3717+
path->slots[0] = i;
3718+
3719+
for (; i < nritems; i++) {
37063720
struct btrfs_dir_item *di;
37073721
struct btrfs_key key;
37083722
int ret;
@@ -4303,7 +4317,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
43034317
{
43044318
struct btrfs_root *log = inode->root->log_root;
43054319
struct btrfs_file_extent_item *extent;
4306-
struct extent_buffer *src = src_path->nodes[0];
4320+
struct extent_buffer *src;
43074321
int ret = 0;
43084322
struct btrfs_key *ins_keys;
43094323
u32 *ins_sizes;
@@ -4314,6 +4328,43 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
43144328
const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
43154329
const u64 i_size = i_size_read(&inode->vfs_inode);
43164330

4331+
/*
4332+
* To keep lockdep happy and avoid deadlocks, clone the source leaf and
4333+
* use the clone. This is because otherwise we would be changing the log
4334+
* tree, to insert items from the subvolume tree or insert csum items,
4335+
* while holding a read lock on a leaf from the subvolume tree, which
4336+
* creates a nasty lock dependency when COWing log tree nodes/leaves:
4337+
*
4338+
* 1) Modifying the log tree triggers an extent buffer allocation while
4339+
* holding a write lock on a parent extent buffer from the log tree.
4340+
* Allocating the pages for an extent buffer, or the extent buffer
4341+
* struct, can trigger inode eviction and finally the inode eviction
4342+
* will trigger a release/remove of a delayed node, which requires
4343+
* taking the delayed node's mutex;
4344+
*
4345+
* 2) Allocating a metadata extent for a log tree can trigger the async
4346+
* reclaim thread and make us wait for it to release enough space and
4347+
* unblock our reservation ticket. The reclaim thread can start
4348+
* flushing delayed items, and that in turn results in the need to
4349+
* lock delayed node mutexes and in the need to write lock extent
4350+
* buffers of a subvolume tree - all this while holding a write lock
4351+
* on the parent extent buffer in the log tree.
4352+
*
4353+
* So one task in scenario 1) running in parallel with another task in
4354+
* scenario 2) could lead to a deadlock, one wanting to lock a delayed
4355+
* node mutex while having a read lock on a leaf from the subvolume,
4356+
* while the other is holding the delayed node's mutex and wants to
4357+
* write lock the same subvolume leaf for flushing delayed items.
4358+
*/
4359+
src = btrfs_clone_extent_buffer(src_path->nodes[0]);
4360+
if (!src)
4361+
return -ENOMEM;
4362+
4363+
i = src_path->slots[0];
4364+
btrfs_release_path(src_path);
4365+
src_path->nodes[0] = src;
4366+
src_path->slots[0] = i;
4367+
43174368
ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
43184369
nr * sizeof(u32), GFP_NOFS);
43194370
if (!ins_data)

fs/btrfs/zoned.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,8 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
134134
super[i] = page_address(page[i]);
135135
}
136136

137-
if (super[0]->generation > super[1]->generation)
137+
if (btrfs_super_generation(super[0]) >
138+
btrfs_super_generation(super[1]))
138139
sector = zones[1].start;
139140
else
140141
sector = zones[0].start;
@@ -466,7 +467,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
466467
goto out;
467468
}
468469

469-
zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
470+
zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
470471
if (!zones) {
471472
ret = -ENOMEM;
472473
goto out;
@@ -585,7 +586,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
585586
}
586587

587588

588-
kfree(zones);
589+
kvfree(zones);
589590

590591
switch (bdev_zoned_model(bdev)) {
591592
case BLK_ZONED_HM:
@@ -617,7 +618,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
617618
return 0;
618619

619620
out:
620-
kfree(zones);
621+
kvfree(zones);
621622
out_free_zone_info:
622623
btrfs_destroy_dev_zone_info(device);
623624

0 commit comments

Comments
 (0)