Skip to content

Commit 4eb7b93

Browse files
zhaohem authored and akpm00 committed
ocfs2: improve write IO performance when fragmentation is high
The group_search function ocfs2_cluster_group_search() should bypass groups with insufficient space to avoid unnecessary searches. This patch is particularly useful when ocfs2 is handling a huge number of small files and volume fragmentation is very high. In this case, ocfs2 is busy looking up an available la (local alloc) window from //global_bitmap. This patch introduces a new member in the group descriptor (gd) struct called 'bg_contig_free_bits', representing the max contiguous free bits in this gd. When ocfs2 allocates a new la window from //global_bitmap, 'bg_contig_free_bits' helps expedite the search process. Let's imagine the path below. 1. The la state (->local_alloc_state) is set to THROTTLED or DISABLED. 2. A user deletes a large file, which triggers ocfs2_local_alloc_seen_free_bits() to set osb->local_alloc_state unconditionally. 3. A write I/O thread runs and triggers the worst-performance path ``` ocfs2_reserve_clusters_with_limit ocfs2_reserve_local_alloc_bits ocfs2_local_alloc_slide_window //[1] + ocfs2_local_alloc_reserve_for_window //[2] + ocfs2_local_alloc_new_window //[3] ocfs2_recalc_la_window ``` [1]: will be called when the la window bits are used up. [2]: when the la state is ENABLED, this function only checks the global_bitmap free bits, so it will generally succeed. [3]: will use the default la window size to search for clusters and then fail; ocfs2_recalc_la_window then attempts other la window sizes. The time complexity is O(n^4), resulting in a significant time cost for scanning the global bitmap. This leads to a dramatic slowdown in write I/Os (e.g., user space 'dd'). For example, take an ocfs2 partition with size: 1.45TB, cluster size: 4KB, la window default size: 106MB. The partition is fragmented by creating & deleting a huge number of small files. Before this patch, the timing of [3] would be (the numbers come from the real world): - la window size change order (size: MB): 106, 53, 26.5, 13, 6.5, 3.25, 1.6, 0.8; only 0.8MB succeeds, and 0.8MB also triggers the la window to be disabled.
ocfs2_local_alloc_new_window retries 8 times, and the first 7 attempts all run in the worst case. - group chain number: 242; ocfs2_claim_suballoc_bits runs its for-loop 242 times - each chain has 49 block groups; ocfs2_search_chain runs its while-loop 49 times - each bg has 32256 blocks; ocfs2_block_group_find_clear_bits runs its while-loop over 32256 bits. Because ocfs2_find_next_zero_bit uses ffz() to find a zero bit, let's use (32256/64) (this is not the worst value) for the timing calculation. The loop count: 7*242*49*(32256/64) = 41835024 (~42 million times). In the worst case, a user space write of 1MB of data will trigger 42M scans. With this patch, the count is '7*242*49 = 83006', a reduction by a factor of 504 (nearly three orders of magnitude). Link: https://lkml.kernel.org/r/[email protected] Signed-off-by: Heming Zhao <[email protected]> Reviewed-by: Joseph Qi <[email protected]> Cc: Changwei Ge <[email protected]> Cc: Gang He <[email protected]> Cc: Joel Becker <[email protected]> Cc: Jun Piao <[email protected]> Cc: Junxiao Bi <[email protected]> Cc: Mark Fasheh <[email protected]> Signed-off-by: Andrew Morton <[email protected]>
1 parent 6b839b3 commit 4eb7b93

File tree

5 files changed

+108
-11
lines changed

5 files changed

+108
-11
lines changed

fs/ocfs2/move_extents.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -685,7 +685,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
685685
}
686686

687687
ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
688-
goal_bit, len);
688+
goal_bit, len, 0, 0);
689689
if (ret) {
690690
ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len,
691691
le16_to_cpu(gd->bg_chain));

fs/ocfs2/ocfs2_fs.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -883,7 +883,8 @@ struct ocfs2_group_desc
883883
__le16 bg_free_bits_count; /* Free bits count */
884884
__le16 bg_chain; /* What chain I am in. */
885885
/*10*/ __le32 bg_generation;
886-
__le32 bg_reserved1;
886+
__le16 bg_contig_free_bits; /* max contig free bits length */
887+
__le16 bg_reserved1;
887888
__le64 bg_next_group; /* Next group in my list, in
888889
blocks */
889890
/*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in

fs/ocfs2/resize.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
9191
u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
9292
u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
9393
u16 old_bg_clusters;
94+
u16 contig_bits;
95+
__le16 old_bg_contig_free_bits;
9496

9597
trace_ocfs2_update_last_group_and_inode(new_clusters,
9698
first_new_cluster);
@@ -122,6 +124,11 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
122124
le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
123125
}
124126

127+
contig_bits = ocfs2_find_max_contig_free_bits(group->bg_bitmap,
128+
le16_to_cpu(group->bg_bits), 0);
129+
old_bg_contig_free_bits = group->bg_contig_free_bits;
130+
group->bg_contig_free_bits = cpu_to_le16(contig_bits);
131+
125132
ocfs2_journal_dirty(handle, group_bh);
126133

127134
/* update the inode accordingly. */
@@ -160,6 +167,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
160167
le16_add_cpu(&group->bg_free_bits_count, backups);
161168
le16_add_cpu(&group->bg_bits, -1 * num_bits);
162169
le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
170+
group->bg_contig_free_bits = old_bg_contig_free_bits;
163171
}
164172
out:
165173
if (ret)

fs/ocfs2/suballoc.c

Lines changed: 92 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,10 @@ struct ocfs2_suballoc_result {
5050
u64 sr_blkno; /* The first allocated block */
5151
unsigned int sr_bit_offset; /* The bit in the bg */
5252
unsigned int sr_bits; /* How many bits we claimed */
53+
unsigned int sr_max_contig_bits; /* The length for contiguous
54+
* free bits, only available
55+
* for cluster group
56+
*/
5357
};
5458

5559
static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
@@ -1272,6 +1276,26 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
12721276
return ret;
12731277
}
12741278

1279+
u16 ocfs2_find_max_contig_free_bits(void *bitmap,
1280+
u16 total_bits, u16 start)
1281+
{
1282+
u16 offset, free_bits;
1283+
u16 contig_bits = 0;
1284+
1285+
while (start < total_bits) {
1286+
offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start);
1287+
if (offset == total_bits)
1288+
break;
1289+
1290+
start = ocfs2_find_next_bit(bitmap, total_bits, offset);
1291+
free_bits = start - offset;
1292+
if (contig_bits < free_bits)
1293+
contig_bits = free_bits;
1294+
}
1295+
1296+
return contig_bits;
1297+
}
1298+
12751299
static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
12761300
struct buffer_head *bg_bh,
12771301
unsigned int bits_wanted,
@@ -1280,6 +1304,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
12801304
{
12811305
void *bitmap;
12821306
u16 best_offset, best_size;
1307+
u16 prev_best_size = 0;
12831308
int offset, start, found, status = 0;
12841309
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
12851310

@@ -1306,6 +1331,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
13061331
/* got a zero after some ones */
13071332
found = 1;
13081333
start = offset + 1;
1334+
prev_best_size = best_size;
13091335
}
13101336
if (found > best_size) {
13111337
best_size = found;
@@ -1318,6 +1344,8 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
13181344
}
13191345
}
13201346

1347+
/* best_size will be allocated, we save prev_best_size */
1348+
res->sr_max_contig_bits = prev_best_size;
13211349
if (best_size) {
13221350
res->sr_bit_offset = best_offset;
13231351
res->sr_bits = best_size;
@@ -1335,11 +1363,15 @@ int ocfs2_block_group_set_bits(handle_t *handle,
13351363
struct ocfs2_group_desc *bg,
13361364
struct buffer_head *group_bh,
13371365
unsigned int bit_off,
1338-
unsigned int num_bits)
1366+
unsigned int num_bits,
1367+
unsigned int max_contig_bits,
1368+
int fastpath)
13391369
{
13401370
int status;
13411371
void *bitmap = bg->bg_bitmap;
13421372
int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1373+
unsigned int start = bit_off + num_bits;
1374+
u16 contig_bits;
13431375

13441376
/* All callers get the descriptor via
13451377
* ocfs2_read_group_descriptor(). Any corruption is a code bug. */
@@ -1371,6 +1403,28 @@ int ocfs2_block_group_set_bits(handle_t *handle,
13711403
while(num_bits--)
13721404
ocfs2_set_bit(bit_off++, bitmap);
13731405

1406+
/*
1407+
* this is optimize path, caller set old contig value
1408+
* in max_contig_bits to bypass finding action.
1409+
*/
1410+
if (fastpath) {
1411+
bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
1412+
} else if (ocfs2_is_cluster_bitmap(alloc_inode)) {
1413+
/*
1414+
* Usually, the block group bitmap allocates only 1 bit
1415+
* at a time, while the cluster group allocates n bits
1416+
* each time. Therefore, we only save the contig bits for
1417+
* the cluster group.
1418+
*/
1419+
contig_bits = ocfs2_find_max_contig_free_bits(bitmap,
1420+
le16_to_cpu(bg->bg_bits), start);
1421+
if (contig_bits > max_contig_bits)
1422+
max_contig_bits = contig_bits;
1423+
bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
1424+
} else {
1425+
bg->bg_contig_free_bits = 0;
1426+
}
1427+
13741428
ocfs2_journal_dirty(handle, group_bh);
13751429

13761430
bail:
@@ -1484,7 +1538,12 @@ static int ocfs2_cluster_group_search(struct inode *inode,
14841538

14851539
BUG_ON(!ocfs2_is_cluster_bitmap(inode));
14861540

1487-
if (gd->bg_free_bits_count) {
1541+
if (le16_to_cpu(gd->bg_contig_free_bits) &&
1542+
le16_to_cpu(gd->bg_contig_free_bits) < bits_wanted)
1543+
return -ENOSPC;
1544+
1545+
/* ->bg_contig_free_bits may un-initialized, so compare again */
1546+
if (le16_to_cpu(gd->bg_free_bits_count) >= bits_wanted) {
14881547
max_bits = le16_to_cpu(gd->bg_bits);
14891548

14901549
/* Tail groups in cluster bitmaps which aren't cpg
@@ -1553,7 +1612,7 @@ static int ocfs2_block_group_search(struct inode *inode,
15531612
BUG_ON(min_bits != 1);
15541613
BUG_ON(ocfs2_is_cluster_bitmap(inode));
15551614

1556-
if (bg->bg_free_bits_count) {
1615+
if (le16_to_cpu(bg->bg_free_bits_count) >= bits_wanted) {
15571616
ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
15581617
group_bh, bits_wanted,
15591618
le16_to_cpu(bg->bg_bits),
@@ -1713,7 +1772,8 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
17131772
}
17141773

17151774
ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1716-
res->sr_bit_offset, res->sr_bits);
1775+
res->sr_bit_offset, res->sr_bits,
1776+
res->sr_max_contig_bits, 0);
17171777
if (ret < 0) {
17181778
ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
17191779
res->sr_bits,
@@ -1847,7 +1907,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
18471907
bg,
18481908
group_bh,
18491909
res->sr_bit_offset,
1850-
res->sr_bits);
1910+
res->sr_bits,
1911+
res->sr_max_contig_bits,
1912+
0);
18511913
if (status < 0) {
18521914
ocfs2_rollback_alloc_dinode_counts(alloc_inode,
18531915
ac->ac_bh, res->sr_bits, chain);
@@ -2161,7 +2223,9 @@ int ocfs2_claim_new_inode_at_loc(handle_t *handle,
21612223
bg,
21622224
bg_bh,
21632225
res->sr_bit_offset,
2164-
res->sr_bits);
2226+
res->sr_bits,
2227+
res->sr_max_contig_bits,
2228+
0);
21652229
if (ret < 0) {
21662230
ocfs2_rollback_alloc_dinode_counts(ac->ac_inode,
21672231
ac->ac_bh, res->sr_bits, chain);
@@ -2380,11 +2444,13 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
23802444
struct buffer_head *group_bh,
23812445
unsigned int bit_off,
23822446
unsigned int num_bits,
2447+
unsigned int max_contig_bits,
23832448
void (*undo_fn)(unsigned int bit,
23842449
unsigned long *bmap))
23852450
{
23862451
int status;
23872452
unsigned int tmp;
2453+
u16 contig_bits;
23882454
struct ocfs2_group_desc *undo_bg = NULL;
23892455
struct journal_head *jh;
23902456

@@ -2431,6 +2497,20 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
24312497
num_bits);
24322498
}
24332499

2500+
/*
2501+
* TODO: even 'num_bits == 1' (the worst case, release 1 cluster),
2502+
* we still need to rescan whole bitmap.
2503+
*/
2504+
if (ocfs2_is_cluster_bitmap(alloc_inode)) {
2505+
contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap,
2506+
le16_to_cpu(bg->bg_bits), 0);
2507+
if (contig_bits > max_contig_bits)
2508+
max_contig_bits = contig_bits;
2509+
bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
2510+
} else {
2511+
bg->bg_contig_free_bits = 0;
2512+
}
2513+
24342514
if (undo_fn)
24352515
spin_unlock(&jh->b_state_lock);
24362516

@@ -2457,6 +2537,7 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
24572537
struct ocfs2_chain_list *cl = &fe->id2.i_chain;
24582538
struct buffer_head *group_bh = NULL;
24592539
struct ocfs2_group_desc *group;
2540+
__le16 old_bg_contig_free_bits = 0;
24602541

24612542
/* The alloc_bh comes from ocfs2_free_dinode() or
24622543
* ocfs2_free_clusters(). The callers have all locked the
@@ -2481,9 +2562,11 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
24812562

24822563
BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
24832564

2565+
if (ocfs2_is_cluster_bitmap(alloc_inode))
2566+
old_bg_contig_free_bits = group->bg_contig_free_bits;
24842567
status = ocfs2_block_group_clear_bits(handle, alloc_inode,
24852568
group, group_bh,
2486-
start_bit, count, undo_fn);
2569+
start_bit, count, 0, undo_fn);
24872570
if (status < 0) {
24882571
mlog_errno(status);
24892572
goto bail;
@@ -2494,7 +2577,8 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
24942577
if (status < 0) {
24952578
mlog_errno(status);
24962579
ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh,
2497-
start_bit, count);
2580+
start_bit, count,
2581+
le16_to_cpu(old_bg_contig_free_bits), 1);
24982582
goto bail;
24992583
}
25002584

fs/ocfs2/suballoc.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,12 +79,16 @@ void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
7979
struct buffer_head *di_bh,
8080
u32 num_bits,
8181
u16 chain);
82+
u16 ocfs2_find_max_contig_free_bits(void *bitmap,
83+
u16 total_bits, u16 start);
8284
int ocfs2_block_group_set_bits(handle_t *handle,
8385
struct inode *alloc_inode,
8486
struct ocfs2_group_desc *bg,
8587
struct buffer_head *group_bh,
8688
unsigned int bit_off,
87-
unsigned int num_bits);
89+
unsigned int num_bits,
90+
unsigned int max_contig_bits,
91+
int fastpath);
8892

8993
int ocfs2_claim_metadata(handle_t *handle,
9094
struct ocfs2_alloc_context *ac,

0 commit comments

Comments
 (0)