Skip to content

Commit 975f3b6

Browse files
committed
Merge tag 'for-6.10-rc7-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs fixes from David Sterba: "Fix a regression in extent map shrinker behaviour. In the past weeks we got reports from users that there are huge latency spikes or freezes. This was bisected to newly added shrinker of extent maps (it was added to fix a build up of the structures in memory). I'm assuming that the freezes would happen to many users after release so I'd like to get it merged now so it's in 6.10. Although the diff size is not small the changes are relatively straightforward, the reporters verified the fixes and we did testing on our side. The fixes: - adjust behaviour under memory pressure and check lock or scheduling conditions, bail out if needed - synchronize tracking of the scanning progress so inode ranges are not skipped or work duplicated - do a delayed iput when scanning a root so evicting an inode does not slow things down in case of lots of dirty data, also fix lockdep warning, a deadlock could happen when writing the dirty data would need to start a transaction" * tag 'for-6.10-rc7-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: btrfs: avoid races when tracking progress for extent map shrinking btrfs: stop extent map shrinker if reschedule is needed btrfs: use delayed iput during extent map shrinking
2 parents a52ff90 + 4484940 commit 975f3b6

File tree

4 files changed

+107
-37
lines changed

4 files changed

+107
-37
lines changed

fs/btrfs/disk-io.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2856,6 +2856,8 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
28562856
if (ret)
28572857
return ret;
28582858

2859+
spin_lock_init(&fs_info->extent_map_shrinker_lock);
2860+
28592861
ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
28602862
if (ret)
28612863
return ret;

fs/btrfs/extent_map.c

Lines changed: 94 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1028,7 +1028,14 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
10281028
return ret;
10291029
}
10301030

1031-
static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_to_scan)
1031+
struct btrfs_em_shrink_ctx {
1032+
long nr_to_scan;
1033+
long scanned;
1034+
u64 last_ino;
1035+
u64 last_root;
1036+
};
1037+
1038+
static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
10321039
{
10331040
const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
10341041
struct extent_map_tree *tree = &inode->extent_tree;
@@ -1057,14 +1064,25 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t
10571064
if (!down_read_trylock(&inode->i_mmap_lock))
10581065
return 0;
10591066

1060-
write_lock(&tree->lock);
1067+
/*
1068+
* We want to be fast because we can be called from any path trying to
1069+
* allocate memory, so if the lock is busy we don't want to spend time
1070+
* waiting for it - either some task is about to do IO for the inode or
1071+
* we may have another task shrinking extent maps, here in this code, so
1072+
* skip this inode.
1073+
*/
1074+
if (!write_trylock(&tree->lock)) {
1075+
up_read(&inode->i_mmap_lock);
1076+
return 0;
1077+
}
1078+
10611079
node = rb_first_cached(&tree->map);
10621080
while (node) {
10631081
struct extent_map *em;
10641082

10651083
em = rb_entry(node, struct extent_map, rb_node);
10661084
node = rb_next(node);
1067-
(*scanned)++;
1085+
ctx->scanned++;
10681086

10691087
if (em->flags & EXTENT_FLAG_PINNED)
10701088
goto next;
@@ -1085,42 +1103,49 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t
10851103
free_extent_map(em);
10861104
nr_dropped++;
10871105
next:
1088-
if (*scanned >= nr_to_scan)
1106+
if (ctx->scanned >= ctx->nr_to_scan)
10891107
break;
10901108

10911109
/*
1092-
* Restart if we had to reschedule, and any extent maps that were
1093-
* pinned before may have become unpinned after we released the
1094-
* lock and took it again.
1110+
* Stop if we need to reschedule or there's contention on the
1111+
* lock. This is to avoid slowing other tasks trying to take the
1112+
* lock and because the shrinker might be called during a memory
1113+
* allocation path and we want to avoid taking a very long time
1114+
* and slowing down all sorts of tasks.
10951115
*/
1096-
if (cond_resched_rwlock_write(&tree->lock))
1097-
node = rb_first_cached(&tree->map);
1116+
if (need_resched() || rwlock_needbreak(&tree->lock))
1117+
break;
10981118
}
10991119
write_unlock(&tree->lock);
11001120
up_read(&inode->i_mmap_lock);
11011121

11021122
return nr_dropped;
11031123
}
11041124

1105-
static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_scan)
1125+
static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
11061126
{
1107-
struct btrfs_fs_info *fs_info = root->fs_info;
11081127
struct btrfs_inode *inode;
11091128
long nr_dropped = 0;
1110-
u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1;
1129+
u64 min_ino = ctx->last_ino + 1;
11111130

11121131
inode = btrfs_find_first_inode(root, min_ino);
11131132
while (inode) {
1114-
nr_dropped += btrfs_scan_inode(inode, scanned, nr_to_scan);
1133+
nr_dropped += btrfs_scan_inode(inode, ctx);
11151134

11161135
min_ino = btrfs_ino(inode) + 1;
1117-
fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode);
1118-
iput(&inode->vfs_inode);
1136+
ctx->last_ino = btrfs_ino(inode);
1137+
btrfs_add_delayed_iput(inode);
11191138

1120-
if (*scanned >= nr_to_scan)
1139+
if (ctx->scanned >= ctx->nr_to_scan)
1140+
break;
1141+
1142+
/*
1143+
* We may be called from memory allocation paths, so we don't
1144+
* want to take too much time and slow down tasks.
1145+
*/
1146+
if (need_resched())
11211147
break;
11221148

1123-
cond_resched();
11241149
inode = btrfs_find_first_inode(root, min_ino);
11251150
}
11261151

@@ -1132,34 +1157,56 @@ static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_s
11321157
* inode if there is one or we will find out this was the last
11331158
* one and move to the next root.
11341159
*/
1135-
fs_info->extent_map_shrinker_last_root = btrfs_root_id(root);
1160+
ctx->last_root = btrfs_root_id(root);
11361161
} else {
11371162
/*
11381163
* No more inodes in this root, set extent_map_shrinker_last_ino to 0 so
11391164
* that when processing the next root we start from its first inode.
11401165
*/
1141-
fs_info->extent_map_shrinker_last_ino = 0;
1142-
fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1;
1166+
ctx->last_ino = 0;
1167+
ctx->last_root = btrfs_root_id(root) + 1;
11431168
}
11441169

11451170
return nr_dropped;
11461171
}
11471172

11481173
long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
11491174
{
1150-
const u64 start_root_id = fs_info->extent_map_shrinker_last_root;
1151-
u64 next_root_id = start_root_id;
1175+
struct btrfs_em_shrink_ctx ctx;
1176+
u64 start_root_id;
1177+
u64 next_root_id;
11521178
bool cycled = false;
11531179
long nr_dropped = 0;
1154-
long scanned = 0;
1180+
1181+
ctx.scanned = 0;
1182+
ctx.nr_to_scan = nr_to_scan;
1183+
1184+
/*
1185+
* In case we have multiple tasks running this shrinker, make the next
1186+
* one start from the next inode in case it starts before we finish.
1187+
*/
1188+
spin_lock(&fs_info->extent_map_shrinker_lock);
1189+
ctx.last_ino = fs_info->extent_map_shrinker_last_ino;
1190+
fs_info->extent_map_shrinker_last_ino++;
1191+
ctx.last_root = fs_info->extent_map_shrinker_last_root;
1192+
spin_unlock(&fs_info->extent_map_shrinker_lock);
1193+
1194+
start_root_id = ctx.last_root;
1195+
next_root_id = ctx.last_root;
11551196

11561197
if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
11571198
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
11581199

1159-
trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, nr);
1200+
trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
1201+
nr, ctx.last_root,
1202+
ctx.last_ino);
11601203
}
11611204

1162-
while (scanned < nr_to_scan) {
1205+
/*
1206+
* We may be called from memory allocation paths, so we don't want to
1207+
* take too much time and slow down tasks, so stop if we need to reschedule.
1208+
*/
1209+
while (ctx.scanned < ctx.nr_to_scan && !need_resched()) {
11631210
struct btrfs_root *root;
11641211
unsigned long count;
11651212

@@ -1171,8 +1218,8 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
11711218
spin_unlock(&fs_info->fs_roots_radix_lock);
11721219
if (start_root_id > 0 && !cycled) {
11731220
next_root_id = 0;
1174-
fs_info->extent_map_shrinker_last_root = 0;
1175-
fs_info->extent_map_shrinker_last_ino = 0;
1221+
ctx.last_root = 0;
1222+
ctx.last_ino = 0;
11761223
cycled = true;
11771224
continue;
11781225
}
@@ -1186,15 +1233,33 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
11861233
continue;
11871234

11881235
if (is_fstree(btrfs_root_id(root)))
1189-
nr_dropped += btrfs_scan_root(root, &scanned, nr_to_scan);
1236+
nr_dropped += btrfs_scan_root(root, &ctx);
11901237

11911238
btrfs_put_root(root);
11921239
}
11931240

1241+
/*
1242+
* In case of multiple tasks running this extent map shrinking code this
1243+
* isn't perfect but it's simple and silences things like KCSAN. It's
1244+
* not possible to know which task made more progress because we can
1245+
* cycle back to the first root and first inode if it's not the first
1246+
* time the shrinker ran, see the above logic. Also a task that started
1247+
* later may finish earlier than another task and make less progress. So
1248+
* make this simple and update to the progress of the last task that
1249+
* finished, with the occasional possibility of having two consecutive
1250+
* runs of the shrinker process the same inodes.
1251+
*/
1252+
spin_lock(&fs_info->extent_map_shrinker_lock);
1253+
fs_info->extent_map_shrinker_last_ino = ctx.last_ino;
1254+
fs_info->extent_map_shrinker_last_root = ctx.last_root;
1255+
spin_unlock(&fs_info->extent_map_shrinker_lock);
1256+
11941257
if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
11951258
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
11961259

1197-
trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr);
1260+
trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped,
1261+
nr, ctx.last_root,
1262+
ctx.last_ino);
11981263
}
11991264

12001265
return nr_dropped;

fs/btrfs/fs.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -630,6 +630,7 @@ struct btrfs_fs_info {
630630
s32 delalloc_batch;
631631

632632
struct percpu_counter evictable_extent_maps;
633+
spinlock_t extent_map_shrinker_lock;
633634
u64 extent_map_shrinker_last_root;
634635
u64 extent_map_shrinker_last_ino;
635636

include/trace/events/btrfs.h

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2556,9 +2556,10 @@ TRACE_EVENT(btrfs_extent_map_shrinker_count,
25562556

25572557
TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,
25582558

2559-
TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_to_scan, long nr),
2559+
TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_to_scan, long nr,
2560+
u64 last_root_id, u64 last_ino),
25602561

2561-
TP_ARGS(fs_info, nr_to_scan, nr),
2562+
TP_ARGS(fs_info, nr_to_scan, nr, last_root_id, last_ino),
25622563

25632564
TP_STRUCT__entry_btrfs(
25642565
__field( long, nr_to_scan )
@@ -2570,8 +2571,8 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,
25702571
TP_fast_assign_btrfs(fs_info,
25712572
__entry->nr_to_scan = nr_to_scan;
25722573
__entry->nr = nr;
2573-
__entry->last_root_id = fs_info->extent_map_shrinker_last_root;
2574-
__entry->last_ino = fs_info->extent_map_shrinker_last_ino;
2574+
__entry->last_root_id = last_root_id;
2575+
__entry->last_ino = last_ino;
25752576
),
25762577

25772578
TP_printk_btrfs("nr_to_scan=%ld nr=%ld last_root=%llu(%s) last_ino=%llu",
@@ -2581,9 +2582,10 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,
25812582

25822583
TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit,
25832584

2584-
TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr),
2585+
TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr,
2586+
u64 last_root_id, u64 last_ino),
25852587

2586-
TP_ARGS(fs_info, nr_dropped, nr),
2588+
TP_ARGS(fs_info, nr_dropped, nr, last_root_id, last_ino),
25872589

25882590
TP_STRUCT__entry_btrfs(
25892591
__field( long, nr_dropped )
@@ -2595,8 +2597,8 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit,
25952597
TP_fast_assign_btrfs(fs_info,
25962598
__entry->nr_dropped = nr_dropped;
25972599
__entry->nr = nr;
2598-
__entry->last_root_id = fs_info->extent_map_shrinker_last_root;
2599-
__entry->last_ino = fs_info->extent_map_shrinker_last_ino;
2600+
__entry->last_root_id = last_root_id;
2601+
__entry->last_ino = last_ino;
26002602
),
26012603

26022604
TP_printk_btrfs("nr_dropped=%ld nr=%ld last_root=%llu(%s) last_ino=%llu",

0 commit comments

Comments
 (0)