Skip to content

Commit 81a046b

Browse files
committed
Merge tag 'for-5.6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba: "Features, highlights: - async discard - "mount -o discard=async" to enable it - freed extents are not discarded immediately, but grouped together and trimmed later, with IO rate limiting - the "sync" mode submits short extents that could have been ignored completely by the device, for SATA prior to 3.1 the requests are unqueued and have a big impact on performance - the actual discard IO requests have been moved out of transaction commit to a worker thread, improving commit latency - IO rate and request size can be tuned by sysfs files, for now enabled only with CONFIG_BTRFS_DEBUG as we might need to add/delete the files and don't have a stable-ish ABI for general use, defaults are conservative - export device state info in sysfs, eg. missing, writeable - no discard of extents known to be untouched on disk (eg. after reservation) - device stats reset is logged with process name and PID that called the ioctl Fixes: - fix missing hole after hole punching and fsync when using NO_HOLES - writeback: range cyclic mode could miss some dirty pages and lead to OOM - two more corner cases for metadata_uuid change after power loss during the change - fix infinite loop during fsync after mix of rename operations Core changes: - qgroup assign returns ENOTCONN when quotas not enabled, used to return EINVAL that was confusing - device closing does not need to allocate memory anymore - snapshot aware code got removed, disabled for years due to performance problems, reimplementation will allow selecting whether defrag breaks or does not break COW on shared extents - tree-checker: - check leaf chunk item size, cross check against number of stripes - verify location keys for DIR_ITEM, DIR_INDEX and XATTR items - new self test for physical -> logical mapping code, used for super block range exclusion - assertion helpers/macros updated to avoid objtool "unreachable code" reports on older compilers or config option combinations" * tag 'for-5.6-tag' 
of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (84 commits) btrfs: free block groups after free'ing fs trees btrfs: Fix split-brain handling when changing FSID to metadata uuid btrfs: Handle another split brain scenario with metadata uuid feature btrfs: Factor out metadata_uuid code from find_fsid. btrfs: Call find_fsid from find_fsid_inprogress Btrfs: fix infinite loop during fsync after rename operations btrfs: set trans->drity in btrfs_commit_transaction btrfs: drop log root for dropped roots btrfs: sysfs, add devid/dev_state kobject and device attributes btrfs: Refactor btrfs_rmap_block to improve readability btrfs: Add self-tests for btrfs_rmap_block btrfs: selftests: Add support for dummy devices btrfs: Move and unexport btrfs_rmap_block btrfs: separate definition of assertion failure handlers btrfs: device stats, log when stats are zeroed btrfs: fix improper setting of scanned for range cyclic write cache pages btrfs: safely advance counter when looking up bio csums btrfs: remove unused member btrfs_device::work btrfs: remove unnecessary wrapper get_alloc_profile btrfs: add correction to handle -1 edge case in async discard ...
2 parents 511fdb7 + 4e19443 commit 81a046b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+3042
-1763
lines changed

fs/btrfs/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
1111
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
1212
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
1313
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
14-
block-rsv.o delalloc-space.o block-group.o
14+
block-rsv.o delalloc-space.o block-group.o discard.o
1515

1616
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
1717
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o

fs/btrfs/block-group.c

Lines changed: 181 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
#include "sysfs.h"
1515
#include "tree-log.h"
1616
#include "delalloc-space.h"
17+
#include "discard.h"
18+
#include "raid56.h"
1719

1820
/*
1921
* Return target flags in extended format or 0 if restripe for this chunk_type
@@ -95,7 +97,7 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
9597
return extended_to_chunk(flags | allowed);
9698
}
9799

98-
static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
100+
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
99101
{
100102
unsigned seq;
101103
u64 flags;
@@ -115,11 +117,6 @@ static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
115117
return btrfs_reduce_alloc_profile(fs_info, flags);
116118
}
117119

118-
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
119-
{
120-
return get_alloc_profile(fs_info, orig_flags);
121-
}
122-
123120
void btrfs_get_block_group(struct btrfs_block_group *cache)
124121
{
125122
atomic_inc(&cache->count);
@@ -131,6 +128,15 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
131128
WARN_ON(cache->pinned > 0);
132129
WARN_ON(cache->reserved > 0);
133130

131+
/*
132+
* A block_group shouldn't be on the discard_list anymore.
133+
* Remove the block_group from the discard_list to prevent us
134+
* from causing a panic due to NULL pointer dereference.
135+
*/
136+
if (WARN_ON(!list_empty(&cache->discard_list)))
137+
btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
138+
cache);
139+
134140
/*
135141
* If not empty, someone is still holding mutex of
136142
* full_stripe_lock, which can only be released by caller.
@@ -466,8 +472,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
466472
} else if (extent_start > start && extent_start < end) {
467473
size = extent_start - start;
468474
total_added += size;
469-
ret = btrfs_add_free_space(block_group, start,
470-
size);
475+
ret = btrfs_add_free_space_async_trimmed(block_group,
476+
start, size);
471477
BUG_ON(ret); /* -ENOMEM or logic error */
472478
start = extent_end + 1;
473479
} else {
@@ -478,7 +484,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
478484
if (start < end) {
479485
size = end - start;
480486
total_added += size;
481-
ret = btrfs_add_free_space(block_group, start, size);
487+
ret = btrfs_add_free_space_async_trimmed(block_group, start,
488+
size);
482489
BUG_ON(ret); /* -ENOMEM or logic error */
483490
}
484491

@@ -1185,21 +1192,8 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
11851192
struct btrfs_space_info *sinfo = cache->space_info;
11861193
u64 num_bytes;
11871194
u64 sinfo_used;
1188-
u64 min_allocable_bytes;
11891195
int ret = -ENOSPC;
11901196

1191-
/*
1192-
* We need some metadata space and system metadata space for
1193-
* allocating chunks in some corner cases until we force to set
1194-
* it to be readonly.
1195-
*/
1196-
if ((sinfo->flags &
1197-
(BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
1198-
!force)
1199-
min_allocable_bytes = SZ_1M;
1200-
else
1201-
min_allocable_bytes = 0;
1202-
12031197
spin_lock(&sinfo->lock);
12041198
spin_lock(&cache->lock);
12051199

@@ -1217,10 +1211,9 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
12171211
* sinfo_used + num_bytes should always <= sinfo->total_bytes.
12181212
*
12191213
* Here we make sure if we mark this bg RO, we still have enough
1220-
* free space as buffer (if min_allocable_bytes is not 0).
1214+
* free space as buffer.
12211215
*/
1222-
if (sinfo_used + num_bytes + min_allocable_bytes <=
1223-
sinfo->total_bytes) {
1216+
if (sinfo_used + num_bytes <= sinfo->total_bytes) {
12241217
sinfo->bytes_readonly += num_bytes;
12251218
cache->ro++;
12261219
list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
@@ -1233,8 +1226,8 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
12331226
btrfs_info(cache->fs_info,
12341227
"unable to make block group %llu ro", cache->start);
12351228
btrfs_info(cache->fs_info,
1236-
"sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
1237-
sinfo_used, num_bytes, min_allocable_bytes);
1229+
"sinfo_used=%llu bg_num_bytes=%llu",
1230+
sinfo_used, num_bytes);
12381231
btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
12391232
}
12401233
return ret;
@@ -1249,6 +1242,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
12491242
struct btrfs_block_group *block_group;
12501243
struct btrfs_space_info *space_info;
12511244
struct btrfs_trans_handle *trans;
1245+
const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
12521246
int ret = 0;
12531247

12541248
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
@@ -1272,10 +1266,28 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
12721266
}
12731267
spin_unlock(&fs_info->unused_bgs_lock);
12741268

1269+
btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
1270+
12751271
mutex_lock(&fs_info->delete_unused_bgs_mutex);
12761272

12771273
/* Don't want to race with allocators so take the groups_sem */
12781274
down_write(&space_info->groups_sem);
1275+
1276+
/*
1277+
* Async discard moves the final block group discard to be prior
1278+
* to the unused_bgs code path. Therefore, if it's not fully
1279+
* trimmed, punt it back to the async discard lists.
1280+
*/
1281+
if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
1282+
!btrfs_is_free_space_trimmed(block_group)) {
1283+
trace_btrfs_skip_unused_block_group(block_group);
1284+
up_write(&space_info->groups_sem);
1285+
/* Requeue if we failed because of async discard */
1286+
btrfs_discard_queue_work(&fs_info->discard_ctl,
1287+
block_group);
1288+
goto next;
1289+
}
1290+
12791291
spin_lock(&block_group->lock);
12801292
if (block_group->reserved || block_group->pinned ||
12811293
block_group->used || block_group->ro ||
@@ -1347,6 +1359,23 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
13471359
}
13481360
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
13491361

1362+
/*
1363+
* At this point, the block_group is read only and should fail
1364+
* new allocations. However, btrfs_finish_extent_commit() can
1365+
* cause this block_group to be placed back on the discard
1366+
* lists because now the block_group isn't fully discarded.
1367+
* Bail here and try again later after discarding everything.
1368+
*/
1369+
spin_lock(&fs_info->discard_ctl.lock);
1370+
if (!list_empty(&block_group->discard_list)) {
1371+
spin_unlock(&fs_info->discard_ctl.lock);
1372+
btrfs_dec_block_group_ro(block_group);
1373+
btrfs_discard_queue_work(&fs_info->discard_ctl,
1374+
block_group);
1375+
goto end_trans;
1376+
}
1377+
spin_unlock(&fs_info->discard_ctl.lock);
1378+
13501379
/* Reset pinned so btrfs_put_block_group doesn't complain */
13511380
spin_lock(&space_info->lock);
13521381
spin_lock(&block_group->lock);
@@ -1362,8 +1391,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
13621391
spin_unlock(&block_group->lock);
13631392
spin_unlock(&space_info->lock);
13641393

1394+
/*
1395+
* The normal path here is an unused block group is passed here,
1396+
* then trimming is handled in the transaction commit path.
1397+
* Async discard interposes before this to do the trimming
1398+
* before coming down the unused block group path as trimming
1399+
* will no longer be done later in the transaction commit path.
1400+
*/
1401+
if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
1402+
goto flip_async;
1403+
13651404
/* DISCARD can flip during remount */
1366-
trimming = btrfs_test_opt(fs_info, DISCARD);
1405+
trimming = btrfs_test_opt(fs_info, DISCARD_SYNC);
13671406

13681407
/* Implicit trim during transaction commit. */
13691408
if (trimming)
@@ -1406,6 +1445,13 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
14061445
spin_lock(&fs_info->unused_bgs_lock);
14071446
}
14081447
spin_unlock(&fs_info->unused_bgs_lock);
1448+
return;
1449+
1450+
flip_async:
1451+
btrfs_end_transaction(trans);
1452+
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
1453+
btrfs_put_block_group(block_group);
1454+
btrfs_discard_punt_unused_bgs_list(fs_info);
14091455
}
14101456

14111457
void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
@@ -1516,6 +1562,102 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
15161562
write_sequnlock(&fs_info->profiles_lock);
15171563
}
15181564

1565+
/**
1566+
* btrfs_rmap_block - Map a physical disk address to a list of logical addresses
1567+
* @chunk_start: logical address of block group
1568+
* @physical: physical address to map to logical addresses
1569+
* @logical: return array of logical addresses which map to @physical
1570+
* @naddrs: length of @logical
1571+
* @stripe_len: size of IO stripe for the given block group
1572+
*
1573+
* Maps a particular @physical disk address to a list of @logical addresses.
1574+
* Used primarily to exclude those portions of a block group that contain super
1575+
* block copies.
1576+
*/
1577+
EXPORT_FOR_TESTS
1578+
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
1579+
u64 physical, u64 **logical, int *naddrs, int *stripe_len)
1580+
{
1581+
struct extent_map *em;
1582+
struct map_lookup *map;
1583+
u64 *buf;
1584+
u64 bytenr;
1585+
u64 data_stripe_length;
1586+
u64 io_stripe_size;
1587+
int i, nr = 0;
1588+
int ret = 0;
1589+
1590+
em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
1591+
if (IS_ERR(em))
1592+
return -EIO;
1593+
1594+
map = em->map_lookup;
1595+
data_stripe_length = em->len;
1596+
io_stripe_size = map->stripe_len;
1597+
1598+
if (map->type & BTRFS_BLOCK_GROUP_RAID10)
1599+
data_stripe_length = div_u64(data_stripe_length,
1600+
map->num_stripes / map->sub_stripes);
1601+
else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
1602+
data_stripe_length = div_u64(data_stripe_length, map->num_stripes);
1603+
else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1604+
data_stripe_length = div_u64(data_stripe_length,
1605+
nr_data_stripes(map));
1606+
io_stripe_size = map->stripe_len * nr_data_stripes(map);
1607+
}
1608+
1609+
buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
1610+
if (!buf) {
1611+
ret = -ENOMEM;
1612+
goto out;
1613+
}
1614+
1615+
for (i = 0; i < map->num_stripes; i++) {
1616+
bool already_inserted = false;
1617+
u64 stripe_nr;
1618+
int j;
1619+
1620+
if (!in_range(physical, map->stripes[i].physical,
1621+
data_stripe_length))
1622+
continue;
1623+
1624+
stripe_nr = physical - map->stripes[i].physical;
1625+
stripe_nr = div64_u64(stripe_nr, map->stripe_len);
1626+
1627+
if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
1628+
stripe_nr = stripe_nr * map->num_stripes + i;
1629+
stripe_nr = div_u64(stripe_nr, map->sub_stripes);
1630+
} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
1631+
stripe_nr = stripe_nr * map->num_stripes + i;
1632+
}
1633+
/*
1634+
* The remaining case would be for RAID56, multiply by
1635+
* nr_data_stripes(). Alternatively, just use rmap_len below
1636+
* instead of map->stripe_len
1637+
*/
1638+
1639+
bytenr = chunk_start + stripe_nr * io_stripe_size;
1640+
1641+
/* Ensure we don't add duplicate addresses */
1642+
for (j = 0; j < nr; j++) {
1643+
if (buf[j] == bytenr) {
1644+
already_inserted = true;
1645+
break;
1646+
}
1647+
}
1648+
1649+
if (!already_inserted)
1650+
buf[nr++] = bytenr;
1651+
}
1652+
1653+
*logical = buf;
1654+
*naddrs = nr;
1655+
*stripe_len = io_stripe_size;
1656+
out:
1657+
free_extent_map(em);
1658+
return ret;
1659+
}
1660+
15191661
static int exclude_super_stripes(struct btrfs_block_group *cache)
15201662
{
15211663
struct btrfs_fs_info *fs_info = cache->fs_info;
@@ -1610,13 +1752,16 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
16101752
cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
16111753
set_free_space_tree_thresholds(cache);
16121754

1755+
cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
1756+
16131757
atomic_set(&cache->count, 1);
16141758
spin_lock_init(&cache->lock);
16151759
init_rwsem(&cache->data_rwsem);
16161760
INIT_LIST_HEAD(&cache->list);
16171761
INIT_LIST_HEAD(&cache->cluster_list);
16181762
INIT_LIST_HEAD(&cache->bg_list);
16191763
INIT_LIST_HEAD(&cache->ro_list);
1764+
INIT_LIST_HEAD(&cache->discard_list);
16201765
INIT_LIST_HEAD(&cache->dirty_list);
16211766
INIT_LIST_HEAD(&cache->io_list);
16221767
btrfs_init_free_space_ctl(cache);
@@ -1775,7 +1920,10 @@ static int read_one_block_group(struct btrfs_fs_info *info,
17751920
inc_block_group_ro(cache, 1);
17761921
} else if (cache->used == 0) {
17771922
ASSERT(list_empty(&cache->bg_list));
1778-
btrfs_mark_bg_unused(cache);
1923+
if (btrfs_test_opt(info, DISCARD_ASYNC))
1924+
btrfs_discard_queue_work(&info->discard_ctl, cache);
1925+
else
1926+
btrfs_mark_bg_unused(cache);
17791927
}
17801928
return 0;
17811929
error:
@@ -2738,8 +2886,10 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
27382886
* dirty list to avoid races between cleaner kthread and space
27392887
* cache writeout.
27402888
*/
2741-
if (!alloc && old_val == 0)
2742-
btrfs_mark_bg_unused(cache);
2889+
if (!alloc && old_val == 0) {
2890+
if (!btrfs_test_opt(info, DISCARD_ASYNC))
2891+
btrfs_mark_bg_unused(cache);
2892+
}
27432893

27442894
btrfs_put_block_group(cache);
27452895
total -= num_bytes;

0 commit comments

Comments
 (0)