Skip to content

Commit 196e402

Browse files
harshadjs authored and tytso committed
ext4: improve cr 0 / cr 1 group scanning
Instead of traversing through groups linearly, scan groups in specific orders at cr 0 and cr 1. At cr 0, we want to find groups that have the largest free order >= the order of the request. So, with this patch, we maintain lists for each possible order and insert each group into a list based on the largest free order in its buddy bitmap. During cr 0 allocation, we traverse these lists in increasing order of largest free order. This allows us to find a group with the best available cr 0 match in constant time. If nothing can be found, we fall back to cr 1 immediately. At CR1, the story is slightly different. We want to traverse in the order of increasing average fragment size. For CR1, we maintain an rb tree of groupinfos which is sorted by average fragment size. Instead of traversing linearly, at CR1, we traverse in the order of increasing average fragment size, starting at the most optimal group. This brings down cr 1 search complexity to log(num groups). For cr >= 2, we just perform the linear search as before. Also, in case of lock contention, we intermittently fall back to linear search even in CR 0 and CR 1 cases. This allows us to proceed during the allocation path even in case of high contention. There is an opportunity to do optimization at CR2 too. That's because at CR2 we only consider groups where the bb_free counter (number of free blocks) is greater than the request extent size. That's left as future work. All the changes introduced in this patch are protected under a new mount option "mb_optimize_scan". With this patchset, the following experiment was performed: Created a highly fragmented disk of size 65TB. The disk had no contiguous 2M regions.
The following command was run 3 times consecutively:

    time dd if=/dev/urandom of=file bs=2M count=10

Here are the results with and without the cr 0/1 optimizations introduced in this patch:

|---------+------------------------------+---------------------------|
|         | Without CR 0/1 Optimizations | With CR 0/1 Optimizations |
|---------+------------------------------+---------------------------|
| 1st run | 5m1.871s                     | 2m47.642s                 |
| 2nd run | 2m28.390s                    | 0m0.611s                  |
| 3rd run | 2m26.530s                    | 0m1.255s                  |
|---------+------------------------------+---------------------------|

Signed-off-by: Harshad Shirwadkar <[email protected]>
Reported-by: kernel test robot <[email protected]>
Reported-by: Dan Carpenter <[email protected]>
Reviewed-by: Andreas Dilger <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Theodore Ts'o <[email protected]>
1 parent 4b68f6d commit 196e402

File tree

5 files changed

+452
-15
lines changed

5 files changed

+452
-15
lines changed

fs/ext4/ext4.h

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,12 @@ enum SHIFT_DIRECTION {
162162
#define EXT4_MB_USE_RESERVED 0x2000
163163
/* Do strict check for free blocks while retrying block allocation */
164164
#define EXT4_MB_STRICT_CHECK 0x4000
165-
165+
/* Large fragment size list lookup succeeded at least once for cr = 0 */
166+
#define EXT4_MB_CR0_OPTIMIZED 0x8000
167+
/* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */
168+
#define EXT4_MB_CR1_OPTIMIZED 0x00010000
169+
/* Perform linear traversal for one group */
170+
#define EXT4_MB_SEARCH_NEXT_LINEAR 0x00020000
166171
struct ext4_allocation_request {
167172
/* target inode for block we're allocating */
168173
struct inode *inode;
@@ -1247,7 +1252,9 @@ struct ext4_inode_info {
12471252
#define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */
12481253
#define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */
12491254
#define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */
1250-
1255+
#define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group
1256+
* scanning in mballoc
1257+
*/
12511258

12521259
#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
12531260
~EXT4_MOUNT_##opt
@@ -1528,9 +1535,14 @@ struct ext4_sb_info {
15281535
unsigned int s_mb_free_pending;
15291536
struct list_head s_freed_data_list; /* List of blocks to be freed
15301537
after commit completed */
1538+
struct rb_root s_mb_avg_fragment_size_root;
1539+
rwlock_t s_mb_rb_lock;
1540+
struct list_head *s_mb_largest_free_orders;
1541+
rwlock_t *s_mb_largest_free_orders_locks;
15311542

15321543
/* tunables */
15331544
unsigned long s_stripe;
1545+
unsigned int s_mb_max_linear_groups;
15341546
unsigned int s_mb_stream_request;
15351547
unsigned int s_mb_max_to_scan;
15361548
unsigned int s_mb_min_to_scan;
@@ -1554,6 +1566,8 @@ struct ext4_sb_info {
15541566
atomic_t s_bal_goals; /* goal hits */
15551567
atomic_t s_bal_breaks; /* too long searches */
15561568
atomic_t s_bal_2orders; /* 2^order hits */
1569+
atomic_t s_bal_cr0_bad_suggestions;
1570+
atomic_t s_bal_cr1_bad_suggestions;
15571571
atomic64_t s_bal_cX_groups_considered[4];
15581572
atomic64_t s_bal_cX_hits[4];
15591573
atomic64_t s_bal_cX_failed[4]; /* cX loop didn't find blocks */
@@ -3360,11 +3374,14 @@ struct ext4_group_info {
33603374
ext4_grpblk_t bb_free; /* total free blocks */
33613375
ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
33623376
ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
3377+
ext4_group_t bb_group; /* Group number */
33633378
struct list_head bb_prealloc_list;
33643379
#ifdef DOUBLE_CHECK
33653380
void *bb_bitmap;
33663381
#endif
33673382
struct rw_semaphore alloc_sem;
3383+
struct rb_node bb_avg_fragment_size_rb;
3384+
struct list_head bb_largest_free_order_node;
33683385
ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block
33693386
* regions, index is order.
33703387
* bb_counters[3] = 5 means

0 commit comments

Comments
 (0)