Skip to content

Commit 1fb9189

Browse files
committed
Merge tag 'for-6.11-rc3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs fixes from David Sterba:

 - extend tree-checker verification of directory item type

 - fix regression in page/folio and extent state tracking in xarray; the
   dirty status can get out of sync and can cause problems, e.g. a hang

 - in send, detect the last extent and allow cloning it instead of
   sending it as a write, reducing the amount of data transferred in the
   stream

 - fix checking of extent references when cleaning deleted subvolumes

 - fix one more case in the extent map shrinker: let it run only in the
   kswapd context so it does not cause latency spikes during other
   operations

* tag 'for-6.11-rc3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: fix invalid mapping of extent xarray state
  btrfs: send: allow cloning non-aligned extent if it ends at i_size
  btrfs: only run the extent map shrinker from kswapd tasks
  btrfs: tree-checker: reject BTRFS_FT_UNKNOWN dir type
  btrfs: check delayed refs when we're checking if a ref exists
2 parents d07b432 + 6252690 commit 1fb9189

File tree

8 files changed

+179
-44
lines changed

8 files changed

+179
-44
lines changed

fs/btrfs/delayed-ref.c

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1134,6 +1134,73 @@ btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 byt
11341134
return find_ref_head(delayed_refs, bytenr, false);
11351135
}
11361136

1137+
/*
 * Three-way compare of a (root, parent) tree block reference against a
 * delayed ref node: ordered first by backref type, then by the type's key
 * (root objectid for keyed refs, parent bytenr for shared refs).
 *
 * Returns <0, 0 or >0 in the usual comparator convention.
 */
static int find_comp(struct btrfs_delayed_ref_node *entry, u64 root, u64 parent)
{
	/* A non-zero parent means the caller describes a shared backref. */
	const int type = parent ? BTRFS_SHARED_BLOCK_REF_KEY :
				  BTRFS_TREE_BLOCK_REF_KEY;
	u64 key;
	u64 entry_key;

	if (type != entry->type)
		return (type < entry->type) ? -1 : 1;

	/* Same type: order by the type-specific key. */
	if (type == BTRFS_TREE_BLOCK_REF_KEY) {
		key = root;
		entry_key = entry->ref_root;
	} else {
		key = parent;
		entry_key = entry->parent;
	}

	if (key == entry_key)
		return 0;
	return (key < entry_key) ? -1 : 1;
}
1159+
1160+
/*
1161+
* Check to see if a given root/parent reference is attached to the head. This
1162+
* only checks for BTRFS_ADD_DELAYED_REF references that match, as that
1163+
* indicates the reference exists for the given root or parent. This is for
1164+
* tree blocks only.
1165+
*
1166+
* @head: the head of the bytenr we're searching.
1167+
* @root: the root objectid of the reference if it is a normal reference.
1168+
* @parent: the parent if this is a shared backref.
1169+
*/
1170+
bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head,
1171+
u64 root, u64 parent)
1172+
{
1173+
struct rb_node *node;
1174+
bool found = false;
1175+
1176+
lockdep_assert_held(&head->mutex);
1177+
1178+
spin_lock(&head->lock);
1179+
node = head->ref_tree.rb_root.rb_node;
1180+
while (node) {
1181+
struct btrfs_delayed_ref_node *entry;
1182+
int ret;
1183+
1184+
entry = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
1185+
ret = find_comp(entry, root, parent);
1186+
if (ret < 0) {
1187+
node = node->rb_left;
1188+
} else if (ret > 0) {
1189+
node = node->rb_right;
1190+
} else {
1191+
/*
1192+
* We only want to count ADD actions, as drops mean the
1193+
* ref doesn't exist.
1194+
*/
1195+
if (entry->action == BTRFS_ADD_DELAYED_REF)
1196+
found = true;
1197+
break;
1198+
}
1199+
}
1200+
spin_unlock(&head->lock);
1201+
return found;
1202+
}
1203+
11371204
void __cold btrfs_delayed_ref_exit(void)
11381205
{
11391206
kmem_cache_destroy(btrfs_delayed_ref_head_cachep);

fs/btrfs/delayed-ref.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,8 @@ void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info);
389389
int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
390390
enum btrfs_reserve_flush_enum flush);
391391
bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
392+
bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head,
393+
u64 root, u64 parent);
392394

393395
static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node)
394396
{

fs/btrfs/extent-tree.c

Lines changed: 45 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5472,23 +5472,62 @@ static int check_ref_exists(struct btrfs_trans_handle *trans,
54725472
struct btrfs_root *root, u64 bytenr, u64 parent,
54735473
int level)
54745474
{
5475+
struct btrfs_delayed_ref_root *delayed_refs;
5476+
struct btrfs_delayed_ref_head *head;
54755477
struct btrfs_path *path;
54765478
struct btrfs_extent_inline_ref *iref;
54775479
int ret;
5480+
bool exists = false;
54785481

54795482
path = btrfs_alloc_path();
54805483
if (!path)
54815484
return -ENOMEM;
5482-
5485+
again:
54835486
ret = lookup_extent_backref(trans, path, &iref, bytenr,
54845487
root->fs_info->nodesize, parent,
54855488
btrfs_root_id(root), level, 0);
5489+
if (ret != -ENOENT) {
5490+
/*
5491+
* If we get 0 then we found our reference, return 1, else
5492+
* return the error if it's not -ENOENT;
5493+
*/
5494+
btrfs_free_path(path);
5495+
return (ret < 0 ) ? ret : 1;
5496+
}
5497+
5498+
/*
5499+
* We could have a delayed ref with this reference, so look it up while
5500+
* we're holding the path open to make sure we don't race with the
5501+
* delayed ref running.
5502+
*/
5503+
delayed_refs = &trans->transaction->delayed_refs;
5504+
spin_lock(&delayed_refs->lock);
5505+
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
5506+
if (!head)
5507+
goto out;
5508+
if (!mutex_trylock(&head->mutex)) {
5509+
/*
5510+
* We're contended, means that the delayed ref is running, get a
5511+
* reference and wait for the ref head to be complete and then
5512+
* try again.
5513+
*/
5514+
refcount_inc(&head->refs);
5515+
spin_unlock(&delayed_refs->lock);
5516+
5517+
btrfs_release_path(path);
5518+
5519+
mutex_lock(&head->mutex);
5520+
mutex_unlock(&head->mutex);
5521+
btrfs_put_delayed_ref_head(head);
5522+
goto again;
5523+
}
5524+
5525+
exists = btrfs_find_delayed_tree_ref(head, root->root_key.objectid, parent);
5526+
mutex_unlock(&head->mutex);
5527+
out:
5528+
spin_unlock(&delayed_refs->lock);
54865529
btrfs_free_path(path);
5487-
if (ret == -ENOENT)
5488-
return 0;
5489-
if (ret < 0)
5490-
return ret;
5491-
return 1;
5530+
return exists ? 1 : 0;
54925531
}
54935532

54945533
/*

fs/btrfs/extent_io.c

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1496,20 +1496,20 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
14961496
free_extent_map(em);
14971497
em = NULL;
14981498

1499+
/*
1500+
* Although the PageDirty bit might be cleared before entering
1501+
* this function, subpage dirty bit is not cleared.
1502+
* So clear subpage dirty bit here so next time we won't submit
1503+
* page for range already written to disk.
1504+
*/
1505+
btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, iosize);
14991506
btrfs_set_range_writeback(inode, cur, cur + iosize - 1);
15001507
if (!PageWriteback(page)) {
15011508
btrfs_err(inode->root->fs_info,
15021509
"page %lu not writeback, cur %llu end %llu",
15031510
page->index, cur, end);
15041511
}
15051512

1506-
/*
1507-
* Although the PageDirty bit is cleared before entering this
1508-
* function, subpage dirty bit is not cleared.
1509-
* So clear subpage dirty bit here so next time we won't submit
1510-
* page for range already written to disk.
1511-
*/
1512-
btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, iosize);
15131513

15141514
submit_extent_page(bio_ctrl, disk_bytenr, page, iosize,
15151515
cur - page_offset(page));

fs/btrfs/extent_map.c

Lines changed: 6 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1147,8 +1147,7 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
11471147
return 0;
11481148

11491149
/*
1150-
* We want to be fast because we can be called from any path trying to
1151-
* allocate memory, so if the lock is busy we don't want to spend time
1150+
* We want to be fast so if the lock is busy we don't want to spend time
11521151
* waiting for it - either some task is about to do IO for the inode or
11531152
* we may have another task shrinking extent maps, here in this code, so
11541153
* skip this inode.
@@ -1191,9 +1190,7 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
11911190
/*
11921191
* Stop if we need to reschedule or there's contention on the
11931192
* lock. This is to avoid slowing other tasks trying to take the
1194-
* lock and because the shrinker might be called during a memory
1195-
* allocation path and we want to avoid taking a very long time
1196-
* and slowing down all sorts of tasks.
1193+
* lock.
11971194
*/
11981195
if (need_resched() || rwlock_needbreak(&tree->lock))
11991196
break;
@@ -1222,12 +1219,7 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
12221219
if (ctx->scanned >= ctx->nr_to_scan)
12231220
break;
12241221

1225-
/*
1226-
* We may be called from memory allocation paths, so we don't
1227-
* want to take too much time and slowdown tasks.
1228-
*/
1229-
if (need_resched())
1230-
break;
1222+
cond_resched();
12311223

12321224
inode = btrfs_find_first_inode(root, min_ino);
12331225
}
@@ -1285,14 +1277,12 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
12851277
ctx.last_ino);
12861278
}
12871279

1288-
/*
1289-
* We may be called from memory allocation paths, so we don't want to
1290-
* take too much time and slowdown tasks, so stop if we need reschedule.
1291-
*/
1292-
while (ctx.scanned < ctx.nr_to_scan && !need_resched()) {
1280+
while (ctx.scanned < ctx.nr_to_scan) {
12931281
struct btrfs_root *root;
12941282
unsigned long count;
12951283

1284+
cond_resched();
1285+
12961286
spin_lock(&fs_info->fs_roots_radix_lock);
12971287
count = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
12981288
(void **)&root,

fs/btrfs/send.c

Lines changed: 39 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6157,25 +6157,51 @@ static int send_write_or_clone(struct send_ctx *sctx,
61576157
u64 offset = key->offset;
61586158
u64 end;
61596159
u64 bs = sctx->send_root->fs_info->sectorsize;
6160+
struct btrfs_file_extent_item *ei;
6161+
u64 disk_byte;
6162+
u64 data_offset;
6163+
u64 num_bytes;
6164+
struct btrfs_inode_info info = { 0 };
61606165

61616166
end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size);
61626167
if (offset >= end)
61636168
return 0;
61646169

6165-
if (clone_root && IS_ALIGNED(end, bs)) {
6166-
struct btrfs_file_extent_item *ei;
6167-
u64 disk_byte;
6168-
u64 data_offset;
6170+
num_bytes = end - offset;
61696171

6170-
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
6171-
struct btrfs_file_extent_item);
6172-
disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
6173-
data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
6174-
ret = clone_range(sctx, path, clone_root, disk_byte,
6175-
data_offset, offset, end - offset);
6176-
} else {
6177-
ret = send_extent_data(sctx, path, offset, end - offset);
6178-
}
6172+
if (!clone_root)
6173+
goto write_data;
6174+
6175+
if (IS_ALIGNED(end, bs))
6176+
goto clone_data;
6177+
6178+
/*
6179+
* If the extent end is not aligned, we can clone if the extent ends at
6180+
* the i_size of the inode and the clone range ends at the i_size of the
6181+
* source inode, otherwise the clone operation fails with -EINVAL.
6182+
*/
6183+
if (end != sctx->cur_inode_size)
6184+
goto write_data;
6185+
6186+
ret = get_inode_info(clone_root->root, clone_root->ino, &info);
6187+
if (ret < 0)
6188+
return ret;
6189+
6190+
if (clone_root->offset + num_bytes == info.size)
6191+
goto clone_data;
6192+
6193+
write_data:
6194+
ret = send_extent_data(sctx, path, offset, num_bytes);
6195+
sctx->cur_inode_next_write_offset = end;
6196+
return ret;
6197+
6198+
clone_data:
6199+
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
6200+
struct btrfs_file_extent_item);
6201+
disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
6202+
data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
6203+
ret = clone_range(sctx, path, clone_root, disk_byte, data_offset, offset,
6204+
num_bytes);
61796205
sctx->cur_inode_next_write_offset = end;
61806206
return ret;
61816207
}

fs/btrfs/super.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include <linux/btrfs.h>
2929
#include <linux/security.h>
3030
#include <linux/fs_parser.h>
31+
#include <linux/swap.h>
3132
#include "messages.h"
3233
#include "delayed-inode.h"
3334
#include "ctree.h"
@@ -2409,6 +2410,15 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont
24092410
const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan);
24102411
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
24112412

2413+
/*
2414+
* We may be called from any task trying to allocate memory and we don't
2415+
* want to slow it down with scanning and dropping extent maps. It would
2416+
* also cause heavy lock contention if many tasks concurrently enter
2417+
* here. Therefore only allow kswapd tasks to scan and drop extent maps.
2418+
*/
2419+
if (!current_is_kswapd())
2420+
return 0;
2421+
24122422
return btrfs_free_extent_maps(fs_info, nr_to_scan);
24132423
}
24142424

fs/btrfs/tree-checker.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -569,9 +569,10 @@ static int check_dir_item(struct extent_buffer *leaf,
569569

570570
/* dir type check */
571571
dir_type = btrfs_dir_ftype(leaf, di);
572-
if (unlikely(dir_type >= BTRFS_FT_MAX)) {
572+
if (unlikely(dir_type <= BTRFS_FT_UNKNOWN ||
573+
dir_type >= BTRFS_FT_MAX)) {
573574
dir_item_err(leaf, slot,
574-
"invalid dir item type, have %u expect [0, %u)",
575+
"invalid dir item type, have %u expect (0, %u)",
575576
dir_type, BTRFS_FT_MAX);
576577
return -EUCLEAN;
577578
}

0 commit comments

Comments
 (0)