Skip to content

Commit 5191290

Browse files
committed
Merge tag 'for-5.18-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba: "This contains feature updates, performance improvements, preparatory and core work and some related VFS updates: Features: - encoded read/write ioctls, allows user space to read or write raw data directly to extents (now compressed, encrypted in the future), will be used by send/receive v2 where it saves processing time - zoned mode now works with metadata DUP (the mkfs.btrfs default) - error message header updates: - print error state: transaction abort, other error, log tree errors - print transient filesystem state: remount, device replace, ignored checksum verifications - tree-checker: verify the transaction id of the to-be-written dirty extent buffer Performance improvements for fsync: - directory logging speedups (up to -90% run time) - avoid logging all directory changes during renames (up to -60% run time) - avoid inode logging during rename and link when possible (up to -60% run time) - prepare extents to be logged before locking a log tree path (throughput +7%) - stop copying old file extents when doing a full fsync() - improved logging of old extents after truncate Core, fixes: - improved stale device identification by dev_t and not just path (for devices that are behind other layers like device mapper) - continued extent tree v2 preparatory work - disable features that won't work yet - add wrappers and abstractions for new tree roots - improved error handling - add super block write annotations around background block group reclaim - fix device scanning messages potentially accessing stale pointer - cleanups and refactoring VFS: - allow reflinks/deduplication from two different mounts of the same filesystem - export and add helpers for read/write range verification, for the encoded ioctls" * tag 'for-5.18-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (98 commits) btrfs: zoned: put block group after final usage btrfs: don't access possibly stale fs_info data in device_list_add btrfs: add 
lockdep_assert_held to need_preemptive_reclaim btrfs: verify the transid of the to-be-written dirty extent buffer btrfs: unify the error handling of btrfs_read_buffer() btrfs: unify the error handling pattern for read_tree_block() btrfs: factor out do_free_extent_accounting helper btrfs: remove last_ref from the extent freeing code btrfs: add a alloc_reserved_extent helper btrfs: remove BUG_ON(ret) in alloc_reserved_tree_block btrfs: add and use helper for unlinking inode during log replay btrfs: extend locking to all space_info members accesses btrfs: zoned: mark relocation as writing fs: allow cross-vfsmount reflink/dedupe btrfs: remove the cross file system checks from remap btrfs: pass btrfs_fs_info to btrfs_recover_relocation btrfs: pass btrfs_fs_info for deleting snapshots and cleaner btrfs: add filesystems state details to error messages btrfs: deal with unexpected extent type during reflinking btrfs: fix unexpected error path when reflinking an inline extent ...
2 parents 9b03992 + d3e2996 commit 5191290

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

50 files changed

+3109
-1331
lines changed

fs/btrfs/backref.c

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -789,11 +789,13 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
789789
if (IS_ERR(eb)) {
790790
free_pref(ref);
791791
return PTR_ERR(eb);
792-
} else if (!extent_buffer_uptodate(eb)) {
792+
}
793+
if (!extent_buffer_uptodate(eb)) {
793794
free_pref(ref);
794795
free_extent_buffer(eb);
795796
return -EIO;
796797
}
798+
797799
if (lock)
798800
btrfs_tree_read_lock(eb);
799801
if (btrfs_header_level(eb) == 0)
@@ -1335,7 +1337,8 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
13351337
if (IS_ERR(eb)) {
13361338
ret = PTR_ERR(eb);
13371339
goto out;
1338-
} else if (!extent_buffer_uptodate(eb)) {
1340+
}
1341+
if (!extent_buffer_uptodate(eb)) {
13391342
free_extent_buffer(eb);
13401343
ret = -EIO;
13411344
goto out;

fs/btrfs/block-group.c

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1522,15 +1522,20 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
15221522
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
15231523
return;
15241524

1525-
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
1525+
sb_start_write(fs_info->sb);
1526+
1527+
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
1528+
sb_end_write(fs_info->sb);
15261529
return;
1530+
}
15271531

15281532
/*
15291533
* Long running balances can keep us blocked here for eternity, so
15301534
* simply skip reclaim if we're unable to get the mutex.
15311535
*/
15321536
if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
15331537
btrfs_exclop_finish(fs_info);
1538+
sb_end_write(fs_info->sb);
15341539
return;
15351540
}
15361541

@@ -1605,6 +1610,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
16051610
spin_unlock(&fs_info->unused_bgs_lock);
16061611
mutex_unlock(&fs_info->reclaim_bgs_lock);
16071612
btrfs_exclop_finish(fs_info);
1613+
sb_end_write(fs_info->sb);
16081614
}
16091615

16101616
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
@@ -2006,6 +2012,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
20062012
cache->length = key->offset;
20072013
cache->used = btrfs_stack_block_group_used(bgi);
20082014
cache->flags = btrfs_stack_block_group_flags(bgi);
2015+
cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
20092016

20102017
set_free_space_tree_thresholds(cache);
20112018

@@ -2288,7 +2295,7 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
22882295
spin_lock(&block_group->lock);
22892296
btrfs_set_stack_block_group_used(&bgi, block_group->used);
22902297
btrfs_set_stack_block_group_chunk_objectid(&bgi,
2291-
BTRFS_FIRST_CHUNK_TREE_OBJECTID);
2298+
block_group->global_root_id);
22922299
btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
22932300
key.objectid = block_group->start;
22942301
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
@@ -2444,6 +2451,27 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
24442451
btrfs_trans_release_chunk_metadata(trans);
24452452
}
24462453

2454+
/*
2455+
* For extent tree v2 we use the block_group_item->chunk_offset to point at our
2456+
* global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
2457+
*/
2458+
static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset)
2459+
{
2460+
u64 div = SZ_1G;
2461+
u64 index;
2462+
2463+
if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
2464+
return BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2465+
2466+
/* If we have a smaller fs index based on 128MiB. */
2467+
if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL))
2468+
div = SZ_128M;
2469+
2470+
offset = div64_u64(offset, div);
2471+
div64_u64_rem(offset, fs_info->nr_global_roots, &index);
2472+
return index;
2473+
}
2474+
24472475
struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
24482476
u64 bytes_used, u64 type,
24492477
u64 chunk_offset, u64 size)
@@ -2464,6 +2492,8 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
24642492
cache->flags = type;
24652493
cache->last_byte_to_unpin = (u64)-1;
24662494
cache->cached = BTRFS_CACHE_FINISHED;
2495+
cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
2496+
24672497
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
24682498
cache->needs_free_space = 1;
24692499

@@ -2693,7 +2723,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
26932723
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
26942724
btrfs_set_stack_block_group_used(&bgi, cache->used);
26952725
btrfs_set_stack_block_group_chunk_objectid(&bgi,
2696-
BTRFS_FIRST_CHUNK_TREE_OBJECTID);
2726+
cache->global_root_id);
26972727
btrfs_set_stack_block_group_flags(&bgi, cache->flags);
26982728
write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
26992729
btrfs_mark_buffer_dirty(leaf);

fs/btrfs/block-group.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ struct btrfs_block_group {
6868
u64 bytes_super;
6969
u64 flags;
7070
u64 cache_generation;
71+
u64 global_root_id;
7172

7273
/*
7374
* If the free space extent count exceeds this number, convert the block

fs/btrfs/btrfs_inode.h

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,13 @@
1313
#include "ordered-data.h"
1414
#include "delayed-inode.h"
1515

16+
/*
17+
* Since we search a directory based on f_pos (struct dir_context::pos) we have
18+
* to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so
19+
* everybody else has to start at 2 (see btrfs_real_readdir() and dir_emit_dots()).
20+
*/
21+
#define BTRFS_DIR_START_INDEX 2
22+
1623
/*
1724
* ordered_data_close is set by truncate when a file that used
1825
* to have good data has been truncated to zero. When it is set
@@ -173,8 +180,9 @@ struct btrfs_inode {
173180
u64 disk_i_size;
174181

175182
/*
176-
* if this is a directory then index_cnt is the counter for the index
177-
* number for new files that are created
183+
* If this is a directory then index_cnt is the counter for the index
184+
* number for new files that are created. For an empty directory, this
185+
* must be initialized to BTRFS_DIR_START_INDEX.
178186
*/
179187
u64 index_cnt;
180188

@@ -333,6 +341,36 @@ static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
333341
spin_unlock(&inode->lock);
334342
}
335343

344+
/*
345+
* Should be called while holding the inode's VFS lock in exclusive mode or in a
346+
* context where no one else can access the inode concurrently (during inode
347+
* creation or when loading an inode from disk).
348+
*/
349+
static inline void btrfs_set_inode_full_sync(struct btrfs_inode *inode)
350+
{
351+
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
352+
/*
353+
* The inode may have been part of a reflink operation in the last
354+
* transaction that modified it, and then a fsync has reset the
355+
* last_reflink_trans to avoid subsequent fsyncs in the same
356+
* transaction to do unnecessary work. So update last_reflink_trans
357+
* to the last_trans value (we have to be pessimistic and assume a
358+
* reflink happened).
359+
*
360+
* The ->last_trans is protected by the inode's spinlock and we can
361+
* have a concurrent ordered extent completion update it. Also set
362+
* last_reflink_trans to ->last_trans only if the former is less than
363+
* the later, because we can be called in a context where
364+
* last_reflink_trans was set to the current transaction generation
365+
* while ->last_trans was not yet updated in the current transaction,
366+
* and therefore has a lower value.
367+
*/
368+
spin_lock(&inode->lock);
369+
if (inode->last_reflink_trans < inode->last_trans)
370+
inode->last_reflink_trans = inode->last_trans;
371+
spin_unlock(&inode->lock);
372+
}
373+
336374
static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
337375
{
338376
bool ret = false;

fs/btrfs/compression.c

Lines changed: 37 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *b
219219
bi_size += bvec->bv_len;
220220

221221
if (bio->bi_status)
222-
cb->errors = 1;
222+
cb->status = bio->bi_status;
223223

224224
ASSERT(bi_size && bi_size <= cb->compressed_len);
225225
last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits,
@@ -234,7 +234,7 @@ static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *b
234234
return last_io;
235235
}
236236

237-
static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bio)
237+
static void finish_compressed_bio_read(struct compressed_bio *cb)
238238
{
239239
unsigned int index;
240240
struct page *page;
@@ -247,19 +247,18 @@ static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bi
247247
}
248248

249249
/* Do io completion on the original bio */
250-
if (cb->errors) {
251-
bio_io_error(cb->orig_bio);
250+
if (cb->status != BLK_STS_OK) {
251+
cb->orig_bio->bi_status = cb->status;
252+
bio_endio(cb->orig_bio);
252253
} else {
253254
struct bio_vec *bvec;
254255
struct bvec_iter_all iter_all;
255256

256-
ASSERT(bio);
257-
ASSERT(!bio->bi_status);
258257
/*
259258
* We have verified the checksum already, set page checked so
260259
* the end_io handlers know about it
261260
*/
262-
ASSERT(!bio_flagged(bio, BIO_CLONED));
261+
ASSERT(!bio_flagged(cb->orig_bio, BIO_CLONED));
263262
bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) {
264263
u64 bvec_start = page_offset(bvec->bv_page) +
265264
bvec->bv_offset;
@@ -308,7 +307,7 @@ static void end_compressed_bio_read(struct bio *bio)
308307
* Some IO in this cb have failed, just skip checksum as there
309308
* is no way it could be correct.
310309
*/
311-
if (cb->errors == 1)
310+
if (cb->status != BLK_STS_OK)
312311
goto csum_failed;
313312

314313
inode = cb->inode;
@@ -324,8 +323,8 @@ static void end_compressed_bio_read(struct bio *bio)
324323

325324
csum_failed:
326325
if (ret)
327-
cb->errors = 1;
328-
finish_compressed_bio_read(cb, bio);
326+
cb->status = errno_to_blk_status(ret);
327+
finish_compressed_bio_read(cb);
329328
out:
330329
bio_put(bio);
331330
}
@@ -342,11 +341,12 @@ static noinline void end_compressed_writeback(struct inode *inode,
342341
unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
343342
struct page *pages[16];
344343
unsigned long nr_pages = end_index - index + 1;
344+
const int errno = blk_status_to_errno(cb->status);
345345
int i;
346346
int ret;
347347

348-
if (cb->errors)
349-
mapping_set_error(inode->i_mapping, -EIO);
348+
if (errno)
349+
mapping_set_error(inode->i_mapping, errno);
350350

351351
while (nr_pages > 0) {
352352
ret = find_get_pages_contig(inode->i_mapping, index,
@@ -358,7 +358,7 @@ static noinline void end_compressed_writeback(struct inode *inode,
358358
continue;
359359
}
360360
for (i = 0; i < ret; i++) {
361-
if (cb->errors)
361+
if (errno)
362362
SetPageError(pages[i]);
363363
btrfs_page_clamp_clear_writeback(fs_info, pages[i],
364364
cb->start, cb->len);
@@ -381,9 +381,10 @@ static void finish_compressed_bio_write(struct compressed_bio *cb)
381381
*/
382382
btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
383383
cb->start, cb->start + cb->len - 1,
384-
!cb->errors);
384+
cb->status == BLK_STS_OK);
385385

386-
end_compressed_writeback(inode, cb);
386+
if (cb->writeback)
387+
end_compressed_writeback(inode, cb);
387388
/* Note, our inode could be gone now */
388389

389390
/*
@@ -506,7 +507,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
506507
struct page **compressed_pages,
507508
unsigned int nr_pages,
508509
unsigned int write_flags,
509-
struct cgroup_subsys_state *blkcg_css)
510+
struct cgroup_subsys_state *blkcg_css,
511+
bool writeback)
510512
{
511513
struct btrfs_fs_info *fs_info = inode->root->fs_info;
512514
struct bio *bio = NULL;
@@ -524,13 +526,14 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
524526
if (!cb)
525527
return BLK_STS_RESOURCE;
526528
refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
527-
cb->errors = 0;
529+
cb->status = BLK_STS_OK;
528530
cb->inode = &inode->vfs_inode;
529531
cb->start = start;
530532
cb->len = len;
531533
cb->mirror_num = 0;
532534
cb->compressed_pages = compressed_pages;
533535
cb->compressed_len = compressed_len;
536+
cb->writeback = writeback;
534537
cb->orig_bio = NULL;
535538
cb->nr_pages = nr_pages;
536539

@@ -591,7 +594,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
591594

592595
if (submit) {
593596
if (!skip_sum) {
594-
ret = btrfs_csum_one_bio(inode, bio, start, 1);
597+
ret = btrfs_csum_one_bio(inode, bio, start, true);
595598
if (ret)
596599
goto finish_cb;
597600
}
@@ -808,7 +811,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
808811
u64 em_len;
809812
u64 em_start;
810813
struct extent_map *em;
811-
blk_status_t ret = BLK_STS_RESOURCE;
814+
blk_status_t ret;
812815
int faili = 0;
813816
u8 *sums;
814817

@@ -821,17 +824,21 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
821824
read_lock(&em_tree->lock);
822825
em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
823826
read_unlock(&em_tree->lock);
824-
if (!em)
825-
return BLK_STS_IOERR;
827+
if (!em) {
828+
ret = BLK_STS_IOERR;
829+
goto out;
830+
}
826831

827832
ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
828833
compressed_len = em->block_len;
829834
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
830-
if (!cb)
835+
if (!cb) {
836+
ret = BLK_STS_RESOURCE;
831837
goto out;
838+
}
832839

833840
refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
834-
cb->errors = 0;
841+
cb->status = BLK_STS_OK;
835842
cb->inode = inode;
836843
cb->mirror_num = mirror_num;
837844
sums = cb->sums;
@@ -851,8 +858,10 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
851858
nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
852859
cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
853860
GFP_NOFS);
854-
if (!cb->compressed_pages)
861+
if (!cb->compressed_pages) {
862+
ret = BLK_STS_RESOURCE;
855863
goto fail1;
864+
}
856865

857866
for (pg_index = 0; pg_index < nr_pages; pg_index++) {
858867
cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS);
@@ -938,7 +947,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
938947
comp_bio = NULL;
939948
}
940949
}
941-
return 0;
950+
return BLK_STS_OK;
942951

943952
fail2:
944953
while (faili >= 0) {
@@ -951,6 +960,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
951960
kfree(cb);
952961
out:
953962
free_extent_map(em);
963+
bio->bi_status = ret;
964+
bio_endio(bio);
954965
return ret;
955966
finish_cb:
956967
if (comp_bio) {
@@ -970,7 +981,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
970981
*/
971982
ASSERT(refcount_read(&cb->pending_sectors));
972983
/* Now we are the only one referring @cb, can finish it safely. */
973-
finish_compressed_bio_read(cb, NULL);
984+
finish_compressed_bio_read(cb);
974985
return ret;
975986
}
976987

0 commit comments

Comments
 (0)