Skip to content

Commit 35278fb

Browse files
boryaskdave
authored andcommitted
btrfs: fix racy bitfield write in btrfs_clear_space_info_full()
From the memory-barriers.txt document regarding memory barrier ordering guarantees: (*) These guarantees do not apply to bitfields, because compilers often generate code to modify these using non-atomic read-modify-write sequences. Do not attempt to use bitfields to synchronize parallel algorithms. (*) Even in cases where bitfields are protected by locks, all fields in a given bitfield must be protected by one lock. If two fields in a given bitfield are protected by different locks, the compiler's non-atomic read-modify-write sequences can cause an update to one field to corrupt the value of an adjacent field. btrfs_space_info has a bitfield sharing an underlying word consisting of the fields full, chunk_alloc, and flush: struct btrfs_space_info { struct btrfs_fs_info * fs_info; /* 0 8 */ struct btrfs_space_info * parent; /* 8 8 */ ... int clamp; /* 172 4 */ unsigned int full:1; /* 176: 0 4 */ unsigned int chunk_alloc:1; /* 176: 1 4 */ unsigned int flush:1; /* 176: 2 4 */ ... Therefore, to be safe from parallel read-modify-writes losing a write to one of the bitfield members protected by a lock, all writes to all the bitfields must use the lock. They almost universally do, except for btrfs_clear_space_info_full() which iterates over the space_infos and writes out found->full = 0 without a lock. Imagine that we have one thread completing a transaction in which we finished deleting a block_group and are thus calling btrfs_clear_space_info_full() while simultaneously the data reclaim ticket infrastructure is running do_async_reclaim_data_space(): T1 T2 btrfs_commit_transaction btrfs_clear_space_info_full data_sinfo->full = 0 READ: full:0, chunk_alloc:0, flush:1 do_async_reclaim_data_space(data_sinfo) spin_lock(&space_info->lock); if(list_empty(tickets)) space_info->flush = 0; READ: full: 0, chunk_alloc:0, flush:1 MOD/WRITE: full: 0, chunk_alloc:0, flush:0 spin_unlock(&space_info->lock); return; MOD/WRITE: full:0, chunk_alloc:0, flush:1 and now data_sinfo->flush is 1 but the reclaim worker has exited. This breaks the invariant that flush is 0 iff there is no work queued or running. Once this invariant is violated, future allocations that go into __reserve_bytes() will add tickets to space_info->tickets but will see space_info->flush is set to 1 and not queue the work. After this, they will block forever on the resulting ticket, as it is now impossible to kick the worker again. I also confirmed by looking at the assembly of the affected kernel that it is doing RMW operations. For example, to set the flush (3rd) bit to 0, the assembly is: andb $0xfb,0x60(%rbx) and similarly for setting the full (1st) bit to 0: andb $0xfe,-0x20(%rax) So I think this is really a bug on practical systems. I have observed a number of systems in this exact state, but am currently unable to reproduce it. Rather than leaving this footgun lying around for the future, take advantage of the fact that there is room in the struct anyway, and that it is already quite large and simply change the three bitfield members to bools. This avoids writes to space_info->full having any effect on writes to space_info->flush, regardless of locking. Fixes: 957780e ("Btrfs: introduce ticketed enospc infrastructure") Reviewed-by: Qu Wenruo <[email protected]> Signed-off-by: Boris Burkov <[email protected]> Signed-off-by: David Sterba <[email protected]>
1 parent 2bf96b5 commit 35278fb

File tree

3 files changed

+17
-17
lines changed

3 files changed

+17
-17
lines changed

fs/btrfs/block-group.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4215,7 +4215,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans,
42154215
mutex_unlock(&fs_info->chunk_mutex);
42164216
} else {
42174217
/* Proceed with allocation */
4218-
space_info->chunk_alloc = 1;
4218+
space_info->chunk_alloc = true;
42194219
wait_for_alloc = false;
42204220
spin_unlock(&space_info->lock);
42214221
}
@@ -4264,7 +4264,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans,
42644264
spin_lock(&space_info->lock);
42654265
if (ret < 0) {
42664266
if (ret == -ENOSPC)
4267-
space_info->full = 1;
4267+
space_info->full = true;
42684268
else
42694269
goto out;
42704270
} else {
@@ -4274,7 +4274,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans,
42744274

42754275
space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
42764276
out:
4277-
space_info->chunk_alloc = 0;
4277+
space_info->chunk_alloc = false;
42784278
spin_unlock(&space_info->lock);
42794279
mutex_unlock(&fs_info->chunk_mutex);
42804280

fs/btrfs/space-info.c

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
192192
struct btrfs_space_info *found;
193193

194194
list_for_each_entry(found, head, list)
195-
found->full = 0;
195+
found->full = false;
196196
}
197197

198198
/*
@@ -372,7 +372,7 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
372372
space_info->bytes_readonly += block_group->bytes_super;
373373
btrfs_space_info_update_bytes_zone_unusable(space_info, block_group->zone_unusable);
374374
if (block_group->length > 0)
375-
space_info->full = 0;
375+
space_info->full = false;
376376
btrfs_try_granting_tickets(info, space_info);
377377
spin_unlock(&space_info->lock);
378378

@@ -1146,7 +1146,7 @@ static void do_async_reclaim_metadata_space(struct btrfs_space_info *space_info)
11461146
spin_lock(&space_info->lock);
11471147
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
11481148
if (!to_reclaim) {
1149-
space_info->flush = 0;
1149+
space_info->flush = false;
11501150
spin_unlock(&space_info->lock);
11511151
return;
11521152
}
@@ -1158,7 +1158,7 @@ static void do_async_reclaim_metadata_space(struct btrfs_space_info *space_info)
11581158
flush_space(fs_info, space_info, to_reclaim, flush_state, false);
11591159
spin_lock(&space_info->lock);
11601160
if (list_empty(&space_info->tickets)) {
1161-
space_info->flush = 0;
1161+
space_info->flush = false;
11621162
spin_unlock(&space_info->lock);
11631163
return;
11641164
}
@@ -1201,7 +1201,7 @@ static void do_async_reclaim_metadata_space(struct btrfs_space_info *space_info)
12011201
flush_state = FLUSH_DELAYED_ITEMS_NR;
12021202
commit_cycles--;
12031203
} else {
1204-
space_info->flush = 0;
1204+
space_info->flush = false;
12051205
}
12061206
} else {
12071207
flush_state = FLUSH_DELAYED_ITEMS_NR;
@@ -1383,7 +1383,7 @@ static void do_async_reclaim_data_space(struct btrfs_space_info *space_info)
13831383

13841384
spin_lock(&space_info->lock);
13851385
if (list_empty(&space_info->tickets)) {
1386-
space_info->flush = 0;
1386+
space_info->flush = false;
13871387
spin_unlock(&space_info->lock);
13881388
return;
13891389
}
@@ -1394,7 +1394,7 @@ static void do_async_reclaim_data_space(struct btrfs_space_info *space_info)
13941394
flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
13951395
spin_lock(&space_info->lock);
13961396
if (list_empty(&space_info->tickets)) {
1397-
space_info->flush = 0;
1397+
space_info->flush = false;
13981398
spin_unlock(&space_info->lock);
13991399
return;
14001400
}
@@ -1411,7 +1411,7 @@ static void do_async_reclaim_data_space(struct btrfs_space_info *space_info)
14111411
data_flush_states[flush_state], false);
14121412
spin_lock(&space_info->lock);
14131413
if (list_empty(&space_info->tickets)) {
1414-
space_info->flush = 0;
1414+
space_info->flush = false;
14151415
spin_unlock(&space_info->lock);
14161416
return;
14171417
}
@@ -1428,7 +1428,7 @@ static void do_async_reclaim_data_space(struct btrfs_space_info *space_info)
14281428
if (maybe_fail_all_tickets(fs_info, space_info))
14291429
flush_state = 0;
14301430
else
1431-
space_info->flush = 0;
1431+
space_info->flush = false;
14321432
} else {
14331433
flush_state = 0;
14341434
}
@@ -1444,7 +1444,7 @@ static void do_async_reclaim_data_space(struct btrfs_space_info *space_info)
14441444

14451445
aborted_fs:
14461446
maybe_fail_all_tickets(fs_info, space_info);
1447-
space_info->flush = 0;
1447+
space_info->flush = false;
14481448
spin_unlock(&space_info->lock);
14491449
}
14501450

@@ -1825,7 +1825,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
18251825
*/
18261826
maybe_clamp_preempt(fs_info, space_info);
18271827

1828-
space_info->flush = 1;
1828+
space_info->flush = true;
18291829
trace_btrfs_trigger_flush(fs_info,
18301830
space_info->flags,
18311831
orig_bytes, flush,

fs/btrfs/space-info.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -142,11 +142,11 @@ struct btrfs_space_info {
142142
flushing. The value is >> clamp, so turns
143143
out to be a 2^clamp divisor. */
144144

145-
unsigned int full:1; /* indicates that we cannot allocate any more
145+
bool full; /* indicates that we cannot allocate any more
146146
chunks for this space */
147-
unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
147+
bool chunk_alloc; /* set if we are allocating a chunk */
148148

149-
unsigned int flush:1; /* set if we are trying to make space */
149+
bool flush; /* set if we are trying to make space */
150150

151151
unsigned int force_alloc; /* set if we need to force a chunk
152152
alloc for this space */

0 commit comments

Comments
 (0)