Skip to content

Commit fa4b8cb

Browse files
fdmananakdave
authored andcommitted
btrfs: avoid iterating over all indexes when logging directory
When logging a directory, after copying all directory index items from the subvolume tree to the log tree, we iterate over the subvolume tree to find all dir index items that are located in leaves COWed (or created) in the current transaction. If we keep logging a directory several times during the same transaction, we end up iterating over the same dir index items everytime we log the directory, wasting time and adding extra lock contention on the subvolume tree. So just keep track of the last logged dir index offset in order to start the search for that index (+1) the next time the directory is logged, as dir index values (key offsets) come from a monotonically increasing counter. The following test measures the difference before and after this change: $ cat test.sh #!/bin/bash DEV=/dev/nullb0 MNT=/mnt/nullb0 umount $DEV &> /dev/null mkfs.btrfs -f $DEV mount -o ssd $DEV $MNT # Time values in milliseconds. declare -a fsync_times # Total number of files added to the test directory. num_files=1000000 # Fsync directory after every N files are added. fsync_period=100 mkdir $MNT/testdir fsync_total_time=0 for ((i = 1; i <= $num_files; i++)); do echo -n > $MNT/testdir/file_$i if [ $((i % fsync_period)) -eq 0 ]; then start=$(date +%s%N) xfs_io -c "fsync" $MNT/testdir end=$(date +%s%N) fsync_total_time=$((fsync_total_time + (end - start))) fsync_times[i]=$(( (end - start) / 1000000 )) echo -n -e "Progress $i / $num_files\r" fi done echo -e "\nHistogram of directory fsync duration in ms:\n" printf '%s\n' "${fsync_times[@]}" | \ perl -MStatistics::Histogram -e '@d = <>; print get_histogram(\@d);' fsync_total_time=$((fsync_total_time / 1000000)) echo -e "\nTotal time spent in fsync: $fsync_total_time ms\n" echo umount $MNT The test was run on a non-debug kernel (Debian's default kernel config) against a 15G null block device. Result before this change: Histogram of directory fsync duration in ms: Count: 10000 Range: 3.000 - 362.000; Mean: 34.556; Median: 31.000; Stddev: 25.751 Percentiles: 90th: 71.000; 95th: 77.000; 99th: 81.000 3.000 - 5.278: 1423 ################################# 5.278 - 8.854: 1173 ########################### 8.854 - 14.467: 591 ############## 14.467 - 23.277: 1025 ####################### 23.277 - 37.105: 1422 ################################# 37.105 - 58.809: 2036 ############################################### 58.809 - 92.876: 2316 ##################################################### 92.876 - 146.346: 6 | 146.346 - 230.271: 6 | 230.271 - 362.000: 2 | Total time spent in fsync: 350527 ms Result after this change: Histogram of directory fsync duration in ms: Count: 10000 Range: 3.000 - 1088.000; Mean: 8.704; Median: 8.000; Stddev: 12.576 Percentiles: 90th: 12.000; 95th: 14.000; 99th: 17.000 3.000 - 6.007: 3222 ################################# 6.007 - 11.276: 5197 ##################################################### 11.276 - 20.506: 1551 ################ 20.506 - 36.674: 24 | 36.674 - 201.552: 1 | 201.552 - 353.841: 4 | 353.841 - 1088.000: 1 | Total time spent in fsync: 92114 ms Signed-off-by: Filipe Manana <[email protected]> Signed-off-by: David Sterba <[email protected]>
1 parent 8eb3dd1 commit fa4b8cb

File tree

2 files changed

+57
-7
lines changed

2 files changed

+57
-7
lines changed

fs/btrfs/btrfs_inode.h

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -142,11 +142,22 @@ struct btrfs_inode {
142142
/* a local copy of root's last_log_commit */
143143
int last_log_commit;
144144

145-
/*
146-
* Total number of bytes pending delalloc, used by stat to calculate the
147-
* real block usage of the file. This is used only for files.
148-
*/
149-
u64 delalloc_bytes;
145+
union {
146+
/*
147+
* Total number of bytes pending delalloc, used by stat to
148+
* calculate the real block usage of the file. This is used
149+
* only for files.
150+
*/
151+
u64 delalloc_bytes;
152+
/*
153+
* The lowest possible index of the next dir index key which
154+
* points to an inode that needs to be logged.
155+
* This is used only for directories.
156+
* Use the helpers btrfs_get_first_dir_index_to_log() and
157+
* btrfs_set_first_dir_index_to_log() to access this field.
158+
*/
159+
u64 first_dir_index_to_log;
160+
};
150161

151162
union {
152163
/*
@@ -247,6 +258,17 @@ struct btrfs_inode {
247258
struct inode vfs_inode;
248259
};
249260

261+
static inline u64 btrfs_get_first_dir_index_to_log(const struct btrfs_inode *inode)
262+
{
263+
return READ_ONCE(inode->first_dir_index_to_log);
264+
}
265+
266+
static inline void btrfs_set_first_dir_index_to_log(struct btrfs_inode *inode,
267+
u64 index)
268+
{
269+
WRITE_ONCE(inode->first_dir_index_to_log, index);
270+
}
271+
250272
static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
251273
{
252274
return container_of(inode, struct btrfs_inode, vfs_inode);

fs/btrfs/tree-log.c

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3618,6 +3618,9 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
36183618
ret = BTRFS_LOG_FORCE_COMMIT;
36193619
else
36203620
inode->last_dir_index_offset = last_index;
3621+
3622+
if (btrfs_get_first_dir_index_to_log(inode) == 0)
3623+
btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset);
36213624
out:
36223625
kfree(ins_data);
36233626

@@ -5376,6 +5379,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
53765379
LIST_HEAD(dir_list);
53775380
struct btrfs_dir_list *dir_elem;
53785381
u64 ino = btrfs_ino(start_inode);
5382+
struct btrfs_inode *curr_inode = start_inode;
53795383
int ret = 0;
53805384

53815385
/*
@@ -5390,18 +5394,23 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
53905394
if (!path)
53915395
return -ENOMEM;
53925396

5397+
/* Pairs with btrfs_add_delayed_iput below. */
5398+
ihold(&curr_inode->vfs_inode);
5399+
53935400
while (true) {
5401+
struct inode *vfs_inode;
53945402
struct extent_buffer *leaf;
53955403
struct btrfs_key min_key;
5404+
u64 next_index;
53965405
bool continue_curr_inode = true;
53975406
int nritems;
53985407
int i;
53995408

54005409
min_key.objectid = ino;
54015410
min_key.type = BTRFS_DIR_INDEX_KEY;
5402-
min_key.offset = 0;
5411+
min_key.offset = btrfs_get_first_dir_index_to_log(curr_inode);
5412+
next_index = min_key.offset;
54035413
again:
5404-
btrfs_release_path(path);
54055414
ret = btrfs_search_forward(root, &min_key, path, trans->transid);
54065415
if (ret < 0) {
54075416
break;
@@ -5426,6 +5435,8 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
54265435
break;
54275436
}
54285437

5438+
next_index = min_key.offset + 1;
5439+
54295440
di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
54305441
type = btrfs_dir_ftype(leaf, di);
54315442
if (btrfs_dir_transid(leaf, di) < trans->transid)
@@ -5466,22 +5477,39 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
54665477
break;
54675478
}
54685479

5480+
btrfs_release_path(path);
5481+
54695482
if (continue_curr_inode && min_key.offset < (u64)-1) {
54705483
min_key.offset++;
54715484
goto again;
54725485
}
54735486

54745487
next:
5488+
btrfs_set_first_dir_index_to_log(curr_inode, next_index);
5489+
54755490
if (list_empty(&dir_list))
54765491
break;
54775492

54785493
dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list);
54795494
ino = dir_elem->ino;
54805495
list_del(&dir_elem->list);
54815496
kfree(dir_elem);
5497+
5498+
btrfs_add_delayed_iput(curr_inode);
5499+
curr_inode = NULL;
5500+
5501+
vfs_inode = btrfs_iget(fs_info->sb, ino, root);
5502+
if (IS_ERR(vfs_inode)) {
5503+
ret = PTR_ERR(vfs_inode);
5504+
break;
5505+
}
5506+
curr_inode = BTRFS_I(vfs_inode);
54825507
}
54835508
out:
54845509
btrfs_free_path(path);
5510+
if (curr_inode)
5511+
btrfs_add_delayed_iput(curr_inode);
5512+
54855513
if (ret) {
54865514
struct btrfs_dir_list *next;
54875515

0 commit comments

Comments
 (0)