
Commit 07b5b8e

riteshharjani authored and tytso committed
ext4: mballoc: introduce pcpu seqcnt for freeing PA to improve ENOSPC handling
There could be a race in ext4_mb_discard_group_preallocations() where the 1st thread may iterate through the group's bb_prealloc_list, remove all the PAs, and add them to the function's local list head. Now if the 2nd thread comes in to discard the group preallocations, it will see that group->bb_prealloc_list is empty and will return 0. In a case where we have only a few groups (e.g. just group 0), this may even return an -ENOSPC error from ext4_mb_new_blocks() (which calls ext4_mb_discard_group_preallocations()). But that is wrong, since the 2nd thread should have waited for the 1st thread to release all the PAs and retried the allocation, given that the 1st thread was going to discard the PAs anyway.

The algorithm using this percpu seq counter goes as below:
1. We sample the percpu discard_pa_seq counter before trying for block allocation in ext4_mb_new_blocks().
2. We increment this percpu discard_pa_seq counter when we either allocate or free these blocks, i.e. while marking those blocks as used/free in mb_mark_used()/mb_free_blocks().
3. We also increment this percpu seq counter when we successfully identify that the bb_prealloc_list is not empty and hence proceed to discard those PAs inside ext4_mb_discard_group_preallocations().

Now, to make sure that the regular fast path of block allocation is not affected, as a small optimization we only sample the percpu seq counter on that cpu. Only when the block allocation fails and the number of freed blocks found was 0 do we sample the percpu seq counter for all cpus, using the function ext4_get_discard_pa_seq_sum() added below. This happens after making sure that all the PAs on grp->bb_prealloc_list got freed, or that the list was empty to begin with.

It can be well argued: why not just check grp->bb_free to see whether there are any free blocks to be allocated? Here are the two concerns that were discussed:
1. If for some reason the blocks available in the group are not appropriate for the allocation logic (say e.g. EXT4_MB_HINT_GOAL_ONLY, although this is not yet implemented), then the retry logic may result in infinite looping since grp->bb_free is non-zero.
2. Also, before preallocation was clubbed with block allocation under the same ext4_lock_group() held, there were a lot of races where grp->bb_free could not be relied upon.

Due to the above, this patch uses the discard_pa_seq logic to determine whether we should retry the block allocation. So if there are n threads trying for block allocation and none of them could allocate or discard any blocks, then all n threads will fail the allocation and return -ENOSPC (since the seq counter for all of them will match, as no block allocation/discard happened in that interval).

Signed-off-by: Ritesh Harjani <[email protected]>
Link: https://lore.kernel.org/r/7f254686903b87c419d798742fd9a1be34f0657b.1589955723.git.riteshh@linux.ibm.com
Signed-off-by: Theodore Ts'o <[email protected]>
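To make the retry decision concrete, here is a minimal userspace C sketch of the same pattern. It is an illustration only, not the kernel code: the per-cpu counters are simulated with an atomic array, and names such as seq_inc(), seq_sum() and should_retry() are invented for this sketch.

/* Build: cc -std=c11 seq_sketch.c && ./a.out */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NCPUS 4

/* simulated stand-ins for the per-cpu discard_pa_seq counters */
static _Atomic unsigned long long discard_seq[NCPUS];

/* analog of this_cpu_inc(discard_pa_seq): bump one "cpu" counter */
static void seq_inc(int cpu)
{
	atomic_fetch_add(&discard_seq[cpu], 1);
}

/* analog of ext4_get_discard_pa_seq_sum(): sum across all "cpus" */
static unsigned long long seq_sum(void)
{
	unsigned long long sum = 0;

	for (int cpu = 0; cpu < NCPUS; cpu++)
		sum += atomic_load(&discard_seq[cpu]);
	return sum;
}

/*
 * Analog of the retry decision: retry if the discard pass freed
 * something, or if any "cpu" allocated/freed/discarded blocks since
 * *seq was sampled. If neither happened, every thread saw the same
 * world and failing with ENOSPC is the correct outcome.
 */
static bool should_retry(int freed, unsigned long long *seq)
{
	unsigned long long now;

	if (freed)
		return true;
	now = seq_sum();
	if (now != *seq) {
		*seq = now;	/* re-arm for the next attempt */
		return true;
	}
	return false;
}

int main(void)
{
	/* fast path: sample only the local "cpu", as in step 1 above */
	unsigned long long seq = atomic_load(&discard_seq[0]);

	/* 1st attempt fails, nothing freed, no activity anywhere */
	printf("retry? %d\n", should_retry(0, &seq));	/* prints 0 */

	/* another "cpu" discards a PA in the meantime (step 3 above) */
	seq_inc(1);

	/* 2nd attempt: the summed seq moved, so we must retry */
	printf("retry? %d\n", should_retry(0, &seq));	/* prints 1 */
	return 0;
}

The key property this models is that the full cross-cpu sum is only computed on the failure path; the common successful allocation touches nothing but its own cpu's counter.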
1 parent cf5e2ca commit 07b5b8e

File tree

1 file changed: +51 -5 lines changed

fs/ext4/mballoc.c

Lines changed: 51 additions & 5 deletions
@@ -351,6 +351,35 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 						ext4_group_t group);
 static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
 
+/*
+ * The algorithm using this percpu seq counter goes below:
+ * 1. We sample the percpu discard_pa_seq counter before trying for block
+ *    allocation in ext4_mb_new_blocks().
+ * 2. We increment this percpu discard_pa_seq counter when we either allocate
+ *    or free these blocks i.e. while marking those blocks as used/free in
+ *    mb_mark_used()/mb_free_blocks().
+ * 3. We also increment this percpu seq counter when we successfully identify
+ *    that the bb_prealloc_list is not empty and hence proceed for discarding
+ *    of those PAs inside ext4_mb_discard_group_preallocations().
+ *
+ * Now to make sure that the regular fast path of block allocation is not
+ * affected, as a small optimization we only sample the percpu seq counter
+ * on that cpu. Only when the block allocation fails and when freed blocks
+ * found were 0, that is when we sample percpu seq counter for all cpus using
+ * below function ext4_get_discard_pa_seq_sum(). This happens after making
+ * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty.
+ */
+static DEFINE_PER_CPU(u64, discard_pa_seq);
+static inline u64 ext4_get_discard_pa_seq_sum(void)
+{
+	int __cpu;
+	u64 __seq = 0;
+
+	for_each_possible_cpu(__cpu)
+		__seq += per_cpu(discard_pa_seq, __cpu);
+	return __seq;
+}
+
 static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
 {
 #if BITS_PER_LONG == 64
@@ -1462,6 +1491,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 	mb_check_buddy(e4b);
 	mb_free_blocks_double(inode, e4b, first, count);
 
+	this_cpu_inc(discard_pa_seq);
 	e4b->bd_info->bb_free += count;
 	if (first < e4b->bd_info->bb_first_free)
 		e4b->bd_info->bb_first_free = first;
@@ -1603,6 +1633,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
 	mb_check_buddy(e4b);
 	mb_mark_used_double(e4b, start, len);
 
+	this_cpu_inc(discard_pa_seq);
 	e4b->bd_info->bb_free -= len;
 	if (e4b->bd_info->bb_first_free == start)
 		e4b->bd_info->bb_first_free += len;
@@ -3962,6 +3993,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
 	INIT_LIST_HEAD(&list);
 repeat:
 	ext4_lock_group(sb, group);
+	this_cpu_inc(discard_pa_seq);
 	list_for_each_entry_safe(pa, tmp,
 				&grp->bb_prealloc_list, pa_group_list) {
 		spin_lock(&pa->pa_lock);
@@ -4544,14 +4576,26 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
 }
 
 static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb,
-			struct ext4_allocation_context *ac)
+			struct ext4_allocation_context *ac, u64 *seq)
 {
 	int freed;
+	u64 seq_retry = 0;
+	bool ret = false;
 
 	freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
-	if (freed)
-		return true;
-	return false;
+	if (freed) {
+		ret = true;
+		goto out_dbg;
+	}
+	seq_retry = ext4_get_discard_pa_seq_sum();
+	if (seq_retry != *seq) {
+		*seq = seq_retry;
+		ret = true;
+	}
+
+out_dbg:
+	mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no");
+	return ret;
 }
 
 /*
@@ -4568,6 +4612,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 	ext4_fsblk_t block = 0;
 	unsigned int inquota = 0;
 	unsigned int reserv_clstrs = 0;
+	u64 seq;
 
 	might_sleep();
 	sb = ar->inode->i_sb;
@@ -4630,6 +4675,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 	}
 
 	ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
+	seq = *this_cpu_ptr(&discard_pa_seq);
 	if (!ext4_mb_use_preallocated(ac)) {
 		ac->ac_op = EXT4_MB_HISTORY_ALLOC;
 		ext4_mb_normalize_request(ac, ar);
@@ -4666,7 +4712,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 			ar->len = ac->ac_b_ex.fe_len;
 		}
 	} else {
-		if (ext4_mb_discard_preallocations_should_retry(sb, ac))
+		if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
 			goto repeat;
 		/*
 		 * If block allocation fails then the pa allocated above
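Putting the hunks together, the original race can be modeled deterministically in userspace. The sketch below is hedged: it uses a single global counter instead of per-cpu ones and invented names (discarder(), bb_free), and it sequences the threads with pthread_join() so the outcome is reproducible. The main thread plays the 2nd thread whose allocation fails and whose discard pass finds bb_prealloc_list already empty (freed == 0); the spawned thread plays the 1st thread finishing its discard. Without the seq comparison the main thread would report -ENOSPC; with it, it notices the activity and retries.

/* Build: cc -std=c11 -pthread race_sketch.c && ./a.out */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic int bb_free;		/* usable blocks in the "group" */
static _Atomic unsigned long long discard_pa_seq;	/* single-counter stand-in */

/* the "1st thread": empties bb_prealloc_list, then frees the blocks */
static void *discarder(void *arg)
{
	(void)arg;
	atomic_fetch_add(&discard_pa_seq, 1);	/* saw a non-empty PA list */
	atomic_fetch_add(&discard_pa_seq, 1);	/* mb_free_blocks() analog */
	atomic_store(&bb_free, 8);		/* PA blocks usable again */
	return NULL;
}

int main(void)
{
	pthread_t a;
	unsigned long long seq = atomic_load(&discard_pa_seq);	/* sampled up front */
	int freed = 0;		/* our discard pass found an empty list */

	if (atomic_load(&bb_free) == 0) {	/* allocation attempt fails */
		/* meanwhile the 1st thread completes its discard */
		pthread_create(&a, NULL, discarder, NULL);
		pthread_join(a, NULL);

		unsigned long long now = atomic_load(&discard_pa_seq);

		if (freed || now != seq)
			printf("seq moved (%llu -> %llu): retry, bb_free = %d\n",
			       seq, now, atomic_load(&bb_free));
		else
			printf("no activity since sampling: return -ENOSPC\n");
	}
	return 0;
}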