Skip to content

Commit 4484940

Browse files
fdmananakdave
authored andcommitted
btrfs: avoid races when tracking progress for extent map shrinking
We store the progress (root and inode numbers) of the extent map shrinker in fs_info without any synchronization but we can have multiple tasks calling into the shrinker during memory allocations when there's enough memory pressure for example. This can result in a task A reading fs_info->extent_map_shrinker_last_ino after another task B updates it, and task A reading fs_info->extent_map_shrinker_last_root before task B updates it, making task A see an odd state that isn't necessarily harmful but may make it skip certain inode ranges or do more work than necessary by going over the same inodes again. These unprotected accesses would also trigger warnings from tools like KCSAN. So add a lock to protect access to these progress fields. Reviewed-by: Josef Bacik <[email protected]> Signed-off-by: Filipe Manana <[email protected]> Reviewed-by: David Sterba <[email protected]> Signed-off-by: David Sterba <[email protected]>
1 parent b3ebb9b commit 4484940

File tree

4 files changed

+76
-29
lines changed

4 files changed

+76
-29
lines changed

fs/btrfs/disk-io.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2856,6 +2856,8 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
28562856
if (ret)
28572857
return ret;
28582858

2859+
spin_lock_init(&fs_info->extent_map_shrinker_lock);
2860+
28592861
ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
28602862
if (ret)
28612863
return ret;

fs/btrfs/extent_map.c

Lines changed: 63 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1028,7 +1028,14 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
10281028
return ret;
10291029
}
10301030

1031-
static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_to_scan)
1031+
struct btrfs_em_shrink_ctx {
1032+
long nr_to_scan;
1033+
long scanned;
1034+
u64 last_ino;
1035+
u64 last_root;
1036+
};
1037+
1038+
static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
10321039
{
10331040
const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
10341041
struct extent_map_tree *tree = &inode->extent_tree;
@@ -1075,7 +1082,7 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t
10751082

10761083
em = rb_entry(node, struct extent_map, rb_node);
10771084
node = rb_next(node);
1078-
(*scanned)++;
1085+
ctx->scanned++;
10791086

10801087
if (em->flags & EXTENT_FLAG_PINNED)
10811088
goto next;
@@ -1096,7 +1103,7 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t
10961103
free_extent_map(em);
10971104
nr_dropped++;
10981105
next:
1099-
if (*scanned >= nr_to_scan)
1106+
if (ctx->scanned >= ctx->nr_to_scan)
11001107
break;
11011108

11021109
/*
@@ -1115,22 +1122,21 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t
11151122
return nr_dropped;
11161123
}
11171124

1118-
static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_scan)
1125+
static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
11191126
{
1120-
struct btrfs_fs_info *fs_info = root->fs_info;
11211127
struct btrfs_inode *inode;
11221128
long nr_dropped = 0;
1123-
u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1;
1129+
u64 min_ino = ctx->last_ino + 1;
11241130

11251131
inode = btrfs_find_first_inode(root, min_ino);
11261132
while (inode) {
1127-
nr_dropped += btrfs_scan_inode(inode, scanned, nr_to_scan);
1133+
nr_dropped += btrfs_scan_inode(inode, ctx);
11281134

11291135
min_ino = btrfs_ino(inode) + 1;
1130-
fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode);
1136+
ctx->last_ino = btrfs_ino(inode);
11311137
btrfs_add_delayed_iput(inode);
11321138

1133-
if (*scanned >= nr_to_scan)
1139+
if (ctx->scanned >= ctx->nr_to_scan)
11341140
break;
11351141

11361142
/*
@@ -1151,38 +1157,56 @@ static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_s
11511157
* inode if there is one or we will find out this was the last
11521158
* one and move to the next root.
11531159
*/
1154-
fs_info->extent_map_shrinker_last_root = btrfs_root_id(root);
1160+
ctx->last_root = btrfs_root_id(root);
11551161
} else {
11561162
/*
11571163
* No more inodes in this root, set extent_map_shrinker_last_ino to 0 so
11581164
* that when processing the next root we start from its first inode.
11591165
*/
1160-
fs_info->extent_map_shrinker_last_ino = 0;
1161-
fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1;
1166+
ctx->last_ino = 0;
1167+
ctx->last_root = btrfs_root_id(root) + 1;
11621168
}
11631169

11641170
return nr_dropped;
11651171
}
11661172

11671173
long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
11681174
{
1169-
const u64 start_root_id = fs_info->extent_map_shrinker_last_root;
1170-
u64 next_root_id = start_root_id;
1175+
struct btrfs_em_shrink_ctx ctx;
1176+
u64 start_root_id;
1177+
u64 next_root_id;
11711178
bool cycled = false;
11721179
long nr_dropped = 0;
1173-
long scanned = 0;
1180+
1181+
ctx.scanned = 0;
1182+
ctx.nr_to_scan = nr_to_scan;
1183+
1184+
/*
1185+
* In case we have multiple tasks running this shrinker, make the next
1186+
* one start from the next inode in case it starts before we finish.
1187+
*/
1188+
spin_lock(&fs_info->extent_map_shrinker_lock);
1189+
ctx.last_ino = fs_info->extent_map_shrinker_last_ino;
1190+
fs_info->extent_map_shrinker_last_ino++;
1191+
ctx.last_root = fs_info->extent_map_shrinker_last_root;
1192+
spin_unlock(&fs_info->extent_map_shrinker_lock);
1193+
1194+
start_root_id = ctx.last_root;
1195+
next_root_id = ctx.last_root;
11741196

11751197
if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
11761198
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
11771199

1178-
trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, nr);
1200+
trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
1201+
nr, ctx.last_root,
1202+
ctx.last_ino);
11791203
}
11801204

11811205
/*
11821206
* We may be called from memory allocation paths, so we don't want to
11831207
* take too much time and slowdown tasks, so stop if we need reschedule.
11841208
*/
1185-
while (scanned < nr_to_scan && !need_resched()) {
1209+
while (ctx.scanned < ctx.nr_to_scan && !need_resched()) {
11861210
struct btrfs_root *root;
11871211
unsigned long count;
11881212

@@ -1194,8 +1218,8 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
11941218
spin_unlock(&fs_info->fs_roots_radix_lock);
11951219
if (start_root_id > 0 && !cycled) {
11961220
next_root_id = 0;
1197-
fs_info->extent_map_shrinker_last_root = 0;
1198-
fs_info->extent_map_shrinker_last_ino = 0;
1221+
ctx.last_root = 0;
1222+
ctx.last_ino = 0;
11991223
cycled = true;
12001224
continue;
12011225
}
@@ -1209,15 +1233,33 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
12091233
continue;
12101234

12111235
if (is_fstree(btrfs_root_id(root)))
1212-
nr_dropped += btrfs_scan_root(root, &scanned, nr_to_scan);
1236+
nr_dropped += btrfs_scan_root(root, &ctx);
12131237

12141238
btrfs_put_root(root);
12151239
}
12161240

1241+
/*
1242+
* In case of multiple tasks running this extent map shrinking code this
1243+
* isn't perfect but it's simple and silences things like KCSAN. It's
1244+
* not possible to know which task made more progress because we can
1245+
* cycle back to the first root and first inode if it's not the first
1246+
* time the shrinker ran, see the above logic. Also a task that started
1247+
* later may finish ealier than another task and made less progress. So
1248+
* make this simple and update to the progress of the last task that
1249+
* finished, with the occasional possiblity of having two consecutive
1250+
* runs of the shrinker process the same inodes.
1251+
*/
1252+
spin_lock(&fs_info->extent_map_shrinker_lock);
1253+
fs_info->extent_map_shrinker_last_ino = ctx.last_ino;
1254+
fs_info->extent_map_shrinker_last_root = ctx.last_root;
1255+
spin_unlock(&fs_info->extent_map_shrinker_lock);
1256+
12171257
if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
12181258
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
12191259

1220-
trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr);
1260+
trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped,
1261+
nr, ctx.last_root,
1262+
ctx.last_ino);
12211263
}
12221264

12231265
return nr_dropped;

fs/btrfs/fs.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -630,6 +630,7 @@ struct btrfs_fs_info {
630630
s32 delalloc_batch;
631631

632632
struct percpu_counter evictable_extent_maps;
633+
spinlock_t extent_map_shrinker_lock;
633634
u64 extent_map_shrinker_last_root;
634635
u64 extent_map_shrinker_last_ino;
635636

include/trace/events/btrfs.h

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2556,9 +2556,10 @@ TRACE_EVENT(btrfs_extent_map_shrinker_count,
25562556

25572557
TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,
25582558

2559-
TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_to_scan, long nr),
2559+
TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_to_scan, long nr,
2560+
u64 last_root_id, u64 last_ino),
25602561

2561-
TP_ARGS(fs_info, nr_to_scan, nr),
2562+
TP_ARGS(fs_info, nr_to_scan, nr, last_root_id, last_ino),
25622563

25632564
TP_STRUCT__entry_btrfs(
25642565
__field( long, nr_to_scan )
@@ -2570,8 +2571,8 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,
25702571
TP_fast_assign_btrfs(fs_info,
25712572
__entry->nr_to_scan = nr_to_scan;
25722573
__entry->nr = nr;
2573-
__entry->last_root_id = fs_info->extent_map_shrinker_last_root;
2574-
__entry->last_ino = fs_info->extent_map_shrinker_last_ino;
2574+
__entry->last_root_id = last_root_id;
2575+
__entry->last_ino = last_ino;
25752576
),
25762577

25772578
TP_printk_btrfs("nr_to_scan=%ld nr=%ld last_root=%llu(%s) last_ino=%llu",
@@ -2581,9 +2582,10 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,
25812582

25822583
TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit,
25832584

2584-
TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr),
2585+
TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr,
2586+
u64 last_root_id, u64 last_ino),
25852587

2586-
TP_ARGS(fs_info, nr_dropped, nr),
2588+
TP_ARGS(fs_info, nr_dropped, nr, last_root_id, last_ino),
25872589

25882590
TP_STRUCT__entry_btrfs(
25892591
__field( long, nr_dropped )
@@ -2595,8 +2597,8 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit,
25952597
TP_fast_assign_btrfs(fs_info,
25962598
__entry->nr_dropped = nr_dropped;
25972599
__entry->nr = nr;
2598-
__entry->last_root_id = fs_info->extent_map_shrinker_last_root;
2599-
__entry->last_ino = fs_info->extent_map_shrinker_last_ino;
2600+
__entry->last_root_id = last_root_id;
2601+
__entry->last_ino = last_ino;
26002602
),
26012603

26022604
TP_printk_btrfs("nr_dropped=%ld nr=%ld last_root=%llu(%s) last_ino=%llu",

0 commit comments

Comments
 (0)