Skip to content

Commit 27bc446

Browse files
brookxu-cn authored and tytso committed
ext4: limit the length of per-inode prealloc list
In the scenario of writing sparse files, the per-inode prealloc list may be very long, resulting in high overhead for ext4_mb_use_preallocated(). To circumvent this problem, we limit the maximum length of the per-inode prealloc list to 512 and allow users to modify it.

After patching, we observed that the CPU sys ratio dropped and the system throughput increased significantly. We created a process to write a sparse file, and the running time of the process on the fixed kernel was significantly reduced, as follows:

Running time on unfixed kernel:
[root@TENCENT64 ~]# time taskset 0x01 ./sparse /data1/sparce.dat
real 0m2.051s
user 0m0.008s
sys 0m2.026s

Running time on fixed kernel:
[root@TENCENT64 ~]# time taskset 0x01 ./sparse /data1/sparce.dat
real 0m0.471s
user 0m0.004s
sys 0m0.395s

Signed-off-by: Chunguang Xu <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Theodore Ts'o <[email protected]>
1 parent 66d5e02 commit 27bc446

File tree

13 files changed

+104
-29
lines changed

13 files changed

+104
-29
lines changed

Documentation/admin-guide/ext4.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -482,6 +482,9 @@ Files in /sys/fs/ext4/<devname>:
482482
multiple of this tuning parameter if the stripe size is not set in the
483483
ext4 superblock
484484

485+
mb_max_inode_prealloc
486+
The maximum length of per-inode ext4_prealloc_space list.
487+
485488
mb_max_to_scan
486489
The maximum number of extents the multiblock allocator will search to
487490
find the best extent.

fs/ext4/ext4.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1070,6 +1070,7 @@ struct ext4_inode_info {
10701070
struct timespec64 i_crtime;
10711071

10721072
/* mballoc */
1073+
atomic_t i_prealloc_active;
10731074
struct list_head i_prealloc_list;
10741075
spinlock_t i_prealloc_lock;
10751076

@@ -1518,6 +1519,7 @@ struct ext4_sb_info {
15181519
unsigned int s_mb_stats;
15191520
unsigned int s_mb_order2_reqs;
15201521
unsigned int s_mb_group_prealloc;
1522+
unsigned int s_mb_max_inode_prealloc;
15211523
unsigned int s_max_dir_size_kb;
15221524
/* where last allocation was done - for stream allocation */
15231525
unsigned long s_mb_last_group;
@@ -2682,7 +2684,7 @@ extern int ext4_mb_release(struct super_block *);
26822684
extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
26832685
struct ext4_allocation_request *, int *);
26842686
extern int ext4_mb_reserve_blocks(struct super_block *, int);
2685-
extern void ext4_discard_preallocations(struct inode *);
2687+
extern void ext4_discard_preallocations(struct inode *, unsigned int);
26862688
extern int __init ext4_init_mballoc(void);
26872689
extern void ext4_exit_mballoc(void);
26882690
extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,

fs/ext4/extents.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
100100
* i_mutex. So we can safely drop the i_data_sem here.
101101
*/
102102
BUG_ON(EXT4_JOURNAL(inode) == NULL);
103-
ext4_discard_preallocations(inode);
103+
ext4_discard_preallocations(inode, 0);
104104
up_write(&EXT4_I(inode)->i_data_sem);
105105
*dropped = 1;
106106
return 0;
@@ -4266,7 +4266,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
42664266
* not a good idea to call discard here directly,
42674267
* but otherwise we'd need to call it every free().
42684268
*/
4269-
ext4_discard_preallocations(inode);
4269+
ext4_discard_preallocations(inode, 0);
42704270
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
42714271
fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE;
42724272
ext4_free_blocks(handle, inode, NULL, newblock,
@@ -5293,7 +5293,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
52935293
}
52945294

52955295
down_write(&EXT4_I(inode)->i_data_sem);
5296-
ext4_discard_preallocations(inode);
5296+
ext4_discard_preallocations(inode, 0);
52975297

52985298
ret = ext4_es_remove_extent(inode, punch_start,
52995299
EXT_MAX_BLOCKS - punch_start);
@@ -5307,7 +5307,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
53075307
up_write(&EXT4_I(inode)->i_data_sem);
53085308
goto out_stop;
53095309
}
5310-
ext4_discard_preallocations(inode);
5310+
ext4_discard_preallocations(inode, 0);
53115311

53125312
ret = ext4_ext_shift_extents(inode, handle, punch_stop,
53135313
punch_stop - punch_start, SHIFT_LEFT);
@@ -5439,7 +5439,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
54395439
goto out_stop;
54405440

54415441
down_write(&EXT4_I(inode)->i_data_sem);
5442-
ext4_discard_preallocations(inode);
5442+
ext4_discard_preallocations(inode, 0);
54435443

54445444
path = ext4_find_extent(inode, offset_lblk, NULL, 0);
54455445
if (IS_ERR(path)) {

fs/ext4/file.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
147147
(atomic_read(&inode->i_writecount) == 1) &&
148148
!EXT4_I(inode)->i_reserved_data_blocks) {
149149
down_write(&EXT4_I(inode)->i_data_sem);
150-
ext4_discard_preallocations(inode);
150+
ext4_discard_preallocations(inode, 0);
151151
up_write(&EXT4_I(inode)->i_data_sem);
152152
}
153153
if (is_dx(inode) && filp->private_data)

fs/ext4/indirect.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -696,7 +696,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
696696
* i_mutex. So we can safely drop the i_data_sem here.
697697
*/
698698
BUG_ON(EXT4_JOURNAL(inode) == NULL);
699-
ext4_discard_preallocations(inode);
699+
ext4_discard_preallocations(inode, 0);
700700
up_write(&EXT4_I(inode)->i_data_sem);
701701
*dropped = 1;
702702
return 0;

fs/ext4/inode.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -383,7 +383,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
383383
*/
384384
if ((ei->i_reserved_data_blocks == 0) &&
385385
!inode_is_open_for_write(inode))
386-
ext4_discard_preallocations(inode);
386+
ext4_discard_preallocations(inode, 0);
387387
}
388388

389389
static int __check_block_validity(struct inode *inode, const char *func,
@@ -4055,7 +4055,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
40554055
if (stop_block > first_block) {
40564056

40574057
down_write(&EXT4_I(inode)->i_data_sem);
4058-
ext4_discard_preallocations(inode);
4058+
ext4_discard_preallocations(inode, 0);
40594059

40604060
ret = ext4_es_remove_extent(inode, first_block,
40614061
stop_block - first_block);
@@ -4210,7 +4210,7 @@ int ext4_truncate(struct inode *inode)
42104210

42114211
down_write(&EXT4_I(inode)->i_data_sem);
42124212

4213-
ext4_discard_preallocations(inode);
4213+
ext4_discard_preallocations(inode, 0);
42144214

42154215
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
42164216
err = ext4_ext_truncate(handle, inode);

fs/ext4/ioctl.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
202202
reset_inode_seed(inode);
203203
reset_inode_seed(inode_bl);
204204

205-
ext4_discard_preallocations(inode);
205+
ext4_discard_preallocations(inode, 0);
206206

207207
err = ext4_mark_inode_dirty(handle, inode);
208208
if (err < 0) {

fs/ext4/mballoc.c

Lines changed: 66 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2878,6 +2878,7 @@ int ext4_mb_init(struct super_block *sb)
28782878
sbi->s_mb_stats = MB_DEFAULT_STATS;
28792879
sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
28802880
sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2881+
sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
28812882
/*
28822883
* The default group preallocation is 512, which for 4k block
28832884
* sizes translates to 2 megabytes. However for bigalloc file
@@ -3816,6 +3817,26 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
38163817
mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
38173818
}
38183819

3820+
/*
 * Mark a preallocation space (PA) as deleted and keep the owning inode's
 * active-PA counter in sync.
 *
 * @sb: super block, used only for the warning message
 * @pa: preallocation space to mark
 *
 * If @pa is already marked deleted, a warning describing it is logged and
 * nothing else changes, so the counter is decremented at most once per PA.
 * For inode PAs (MB_INODE_PA) the owning inode's i_prealloc_active is
 * decremented; PAs of any other type only get the flag set.
 *
 * NOTE(review): the call sites visible in this diff invoke this helper right
 * before spin_unlock(&pa->pa_lock), so it appears to expect pa->pa_lock to be
 * held -- confirm against the full file.
 */
static void ext4_mb_mark_pa_deleted(struct super_block *sb,
3821+
struct ext4_prealloc_space *pa)
3822+
{
3823+
struct ext4_inode_info *ei;
3824+
3825+
/* Already deleted: report it and bail out without touching the counter. */
if (pa->pa_deleted) {
3826+
ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
3827+
pa->pa_type, pa->pa_pstart, pa->pa_lstart,
3828+
pa->pa_len);
3829+
return;
3830+
}
3831+
3832+
pa->pa_deleted = 1;
3833+
3834+
/* Only inode PAs are counted in i_prealloc_active. */
if (pa->pa_type == MB_INODE_PA) {
3835+
ei = EXT4_I(pa->pa_inode);
3836+
atomic_dec(&ei->i_prealloc_active);
3837+
}
3838+
}
3839+
38193840
static void ext4_mb_pa_callback(struct rcu_head *head)
38203841
{
38213842
struct ext4_prealloc_space *pa;
@@ -3848,7 +3869,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
38483869
return;
38493870
}
38503871

3851-
pa->pa_deleted = 1;
3872+
ext4_mb_mark_pa_deleted(sb, pa);
38523873
spin_unlock(&pa->pa_lock);
38533874

38543875
grp_blk = pa->pa_pstart;
@@ -3972,6 +3993,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
39723993
spin_lock(pa->pa_obj_lock);
39733994
list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
39743995
spin_unlock(pa->pa_obj_lock);
3996+
atomic_inc(&ei->i_prealloc_active);
39753997
}
39763998

39773999
/*
@@ -4182,7 +4204,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
41824204
}
41834205

41844206
/* seems this one can be freed ... */
4185-
pa->pa_deleted = 1;
4207+
ext4_mb_mark_pa_deleted(sb, pa);
41864208

41874209
/* we can trust pa_free ... */
41884210
free += pa->pa_free;
@@ -4245,7 +4267,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
42454267
*
42464268
* FIXME!! Make sure it is valid at all the call sites
42474269
*/
4248-
void ext4_discard_preallocations(struct inode *inode)
4270+
void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
42494271
{
42504272
struct ext4_inode_info *ei = EXT4_I(inode);
42514273
struct super_block *sb = inode->i_sb;
@@ -4263,15 +4285,19 @@ void ext4_discard_preallocations(struct inode *inode)
42634285

42644286
mb_debug(sb, "discard preallocation for inode %lu\n",
42654287
inode->i_ino);
4266-
trace_ext4_discard_preallocations(inode);
4288+
trace_ext4_discard_preallocations(inode,
4289+
atomic_read(&ei->i_prealloc_active), needed);
42674290

42684291
INIT_LIST_HEAD(&list);
42694292

4293+
if (needed == 0)
4294+
needed = UINT_MAX;
4295+
42704296
repeat:
42714297
/* first, collect all pa's in the inode */
42724298
spin_lock(&ei->i_prealloc_lock);
4273-
while (!list_empty(&ei->i_prealloc_list)) {
4274-
pa = list_entry(ei->i_prealloc_list.next,
4299+
while (!list_empty(&ei->i_prealloc_list) && needed) {
4300+
pa = list_entry(ei->i_prealloc_list.prev,
42754301
struct ext4_prealloc_space, pa_inode_list);
42764302
BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
42774303
spin_lock(&pa->pa_lock);
@@ -4288,10 +4314,11 @@ void ext4_discard_preallocations(struct inode *inode)
42884314

42894315
}
42904316
if (pa->pa_deleted == 0) {
4291-
pa->pa_deleted = 1;
4317+
ext4_mb_mark_pa_deleted(sb, pa);
42924318
spin_unlock(&pa->pa_lock);
42934319
list_del_rcu(&pa->pa_inode_list);
42944320
list_add(&pa->u.pa_tmp_list, &list);
4321+
needed--;
42954322
continue;
42964323
}
42974324

@@ -4592,7 +4619,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
45924619
BUG_ON(pa->pa_type != MB_GROUP_PA);
45934620

45944621
/* seems this one can be freed ... */
4595-
pa->pa_deleted = 1;
4622+
ext4_mb_mark_pa_deleted(sb, pa);
45964623
spin_unlock(&pa->pa_lock);
45974624

45984625
list_del_rcu(&pa->pa_inode_list);
@@ -4690,11 +4717,30 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
46904717
return ;
46914718
}
46924719

4720+
/*
4721+
* If the per-inode prealloc list is too long, trim some PAs.
*
* @inode: inode whose preallocation list is checked
*
* Reads the inode's i_prealloc_active counter and compares it with the
* s_mb_max_inode_prealloc limit plus a slack of (limit / 4) + 1. Only when
* the count exceeds limit + slack is ext4_discard_preallocations() asked to
* drop (count - limit) entries; presumably the slack keeps trimming from
* running on every allocation once the limit is reached -- confirm intent.
*
* NOTE(review): count and delta are int while s_mb_max_inode_prealloc is
* declared unsigned int, so the comparison and the subtraction are carried
* out in unsigned arithmetic; limit + delta could wrap for very large limit
* values -- confirm that the tunable is bounded elsewhere.
4722+
*/
4723+
static void ext4_mb_trim_inode_pa(struct inode *inode)
4724+
{
4725+
struct ext4_inode_info *ei = EXT4_I(inode);
4726+
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4727+
int count, delta;
4728+
4729+
count = atomic_read(&ei->i_prealloc_active);
4730+
/* slack above the limit: a quarter of the limit, plus one */
delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1;
4731+
if (count > sbi->s_mb_max_inode_prealloc + delta) {
4732+
/* number of entries to drop so the list shrinks back to the limit */
count -= sbi->s_mb_max_inode_prealloc;
4733+
ext4_discard_preallocations(inode, count);
4734+
}
4735+
}
4736+
46934737
/*
46944738
* release all resource we used in allocation
46954739
*/
46964740
static int ext4_mb_release_context(struct ext4_allocation_context *ac)
46974741
{
4742+
struct inode *inode = ac->ac_inode;
4743+
struct ext4_inode_info *ei = EXT4_I(inode);
46984744
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
46994745
struct ext4_prealloc_space *pa = ac->ac_pa;
47004746
if (pa) {
@@ -4720,6 +4766,17 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
47204766
ext4_mb_add_n_trim(ac);
47214767
}
47224768
}
4769+
4770+
if (pa->pa_type == MB_INODE_PA) {
4771+
/*
4772+
* treat per-inode prealloc list as a lru list, then try
4773+
* to trim the least recently used PA.
4774+
*/
4775+
spin_lock(pa->pa_obj_lock);
4776+
list_move(&pa->pa_inode_list, &ei->i_prealloc_list);
4777+
spin_unlock(pa->pa_obj_lock);
4778+
}
4779+
47234780
ext4_mb_put_pa(ac, ac->ac_sb, pa);
47244781
}
47254782
if (ac->ac_bitmap_page)
@@ -4729,6 +4786,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
47294786
if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
47304787
mutex_unlock(&ac->ac_lg->lg_mutex);
47314788
ext4_mb_collect_stats(ac);
4789+
ext4_mb_trim_inode_pa(inode);
47324790
return 0;
47334791
}
47344792

fs/ext4/mballoc.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,10 @@
7373
*/
7474
#define MB_DEFAULT_GROUP_PREALLOC 512
7575

76+
/*
77+
* maximum length of inode prealloc list
78+
*/
79+
#define MB_DEFAULT_MAX_INODE_PREALLOC 512
7680

7781
struct ext4_free_data {
7882
/* this links the free block information from sb_info */

fs/ext4/move_extent.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -686,8 +686,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
686686

687687
out:
688688
if (*moved_len) {
689-
ext4_discard_preallocations(orig_inode);
690-
ext4_discard_preallocations(donor_inode);
689+
ext4_discard_preallocations(orig_inode, 0);
690+
ext4_discard_preallocations(donor_inode, 0);
691691
}
692692

693693
ext4_ext_drop_refs(path);

0 commit comments

Comments
 (0)