Commit 57b6f58

Merge remote-tracking branch 'remotes/hreitz/tags/pull-block-2021-09-15' into staging
Block patches:
- Block-status cache for data regions
- qcow2 optimization (when using subclusters)
- iotests delinting, and let 297 (lint checker) cover named iotests
- qcow2 check improvements
- Added -F (target backing file format) option to qemu-img convert
- Mirror job fix
- Fix for when a migration is initiated while a backup job runs
- Fix for uncached qemu-img convert to a volume with 4k sectors (for an unaligned image)
- Minor gluster driver fix

# gpg: Signature made Wed 15 Sep 2021 18:39:11 BST
# gpg:                using RSA key CB62D7A0EE3829E45F004D34A1FA40D098019CDF
# gpg:                issuer "[email protected]"
# gpg: Good signature from "Hanna Reitz <[email protected]>" [marginal]
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg:          It is not certain that the signature belongs to the owner.
# Primary key fingerprint: CB62 D7A0 EE38 29E4 5F00 4D34 A1FA 40D0 9801 9CDF

* remotes/hreitz/tags/pull-block-2021-09-15: (32 commits)
  qemu-img: Add -F shorthand to convert
  qcow2-refcount: check_refblocks(): add separate message for reserved
  qcow2-refcount: check_refcounts_l1(): check reserved bits
  qcow2-refcount: improve style of check_refcounts_l1()
  qcow2-refcount: check_refcounts_l2(): check reserved bits
  qcow2-refcount: check_refcounts_l2(): check l2_bitmap
  qcow2-refcount: fix_l2_entry_by_zero(): also zero L2 entry bitmap
  qcow2-refcount: introduce fix_l2_entry_by_zero()
  qcow2: introduce qcow2_parse_compressed_l2_entry() helper
  qcow2: compressed read: simplify cluster descriptor passing
  qcow2-refcount: improve style of check_refcounts_l2()
  qemu-img: Allow target be aligned to sector size
  qcow2: handle_dependencies(): relax conflict detection
  qcow2: refactor handle_dependencies() loop body
  simplebench: add img_bench_templater.py
  block: bdrv_inactivate_recurse(): check for permissions and fix crash
  tests: add migrate-during-backup
  block/mirror: fix NULL pointer dereference in mirror_wait_on_conflicts()
  iotests/297: Cover tests/
  mirror-top-perms: Fix AbnormalShutdown path
  ...

Signed-off-by: Peter Maydell <[email protected]>
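For reference, one plausible invocation of the new -F shorthand (file names invented for illustration; -F gives the format of the backing file named with the existing -B option, mirroring the -F option that qemu-img create and rebase already accept):

    qemu-img convert -O qcow2 -B base.qcow2 -F qcow2 overlay.qcow2 out.qcow2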
2 parents 7b7ab2d + 1899bf4 commit 57b6f58

26 files changed, +855 -217 lines changed

block.c

Lines changed: 88 additions & 0 deletions
@@ -49,6 +49,8 @@
 #include "qemu/timer.h"
 #include "qemu/cutils.h"
 #include "qemu/id.h"
+#include "qemu/range.h"
+#include "qemu/rcu.h"
 #include "block/coroutines.h"
 
 #ifdef CONFIG_BSD
@@ -401,6 +403,9 @@ BlockDriverState *bdrv_new(void)
 
     qemu_co_queue_init(&bs->flush_queue);
 
+    qemu_co_mutex_init(&bs->bsc_modify_lock);
+    bs->block_status_cache = g_new0(BdrvBlockStatusCache, 1);
+
     for (i = 0; i < bdrv_drain_all_count; i++) {
         bdrv_drained_begin(bs);
     }
@@ -4694,6 +4699,8 @@ static void bdrv_close(BlockDriverState *bs)
     bs->explicit_options = NULL;
     qobject_unref(bs->full_open_options);
     bs->full_open_options = NULL;
+    g_free(bs->block_status_cache);
+    bs->block_status_cache = NULL;
 
     bdrv_release_named_dirty_bitmaps(bs);
     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
@@ -6319,6 +6326,7 @@ static int bdrv_inactivate_recurse(BlockDriverState *bs)
 {
     BdrvChild *child, *parent;
     int ret;
+    uint64_t cumulative_perms, cumulative_shared_perms;
 
     if (!bs->drv) {
         return -ENOMEDIUM;
@@ -6349,6 +6357,13 @@ static int bdrv_inactivate_recurse(BlockDriverState *bs)
         }
     }
 
+    bdrv_get_cumulative_perm(bs, &cumulative_perms,
+                             &cumulative_shared_perms);
+    if (cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) {
+        /* Our inactive parents still need write access. Inactivation failed. */
+        return -EPERM;
+    }
+
     bs->open_flags |= BDRV_O_INACTIVE;
 
     /*
@@ -7684,3 +7699,76 @@ BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs)
 {
     return bdrv_skip_filters(bdrv_cow_bs(bdrv_skip_filters(bs)));
 }
+
+/**
+ * Check whether [offset, offset + bytes) overlaps with the cached
+ * block-status data region.
+ *
+ * If so, and @pnum is not NULL, set *pnum to `bsc.data_end - offset`,
+ * which is what bdrv_bsc_is_data()'s interface needs.
+ * Otherwise, *pnum is not touched.
+ */
+static bool bdrv_bsc_range_overlaps_locked(BlockDriverState *bs,
+                                           int64_t offset, int64_t bytes,
+                                           int64_t *pnum)
+{
+    BdrvBlockStatusCache *bsc = qatomic_rcu_read(&bs->block_status_cache);
+    bool overlaps;
+
+    overlaps =
+        qatomic_read(&bsc->valid) &&
+        ranges_overlap(offset, bytes, bsc->data_start,
+                       bsc->data_end - bsc->data_start);
+
+    if (overlaps && pnum) {
+        *pnum = bsc->data_end - offset;
+    }
+
+    return overlaps;
+}
+
+/**
+ * See block_int.h for this function's documentation.
+ */
+bool bdrv_bsc_is_data(BlockDriverState *bs, int64_t offset, int64_t *pnum)
+{
+    RCU_READ_LOCK_GUARD();
+
+    return bdrv_bsc_range_overlaps_locked(bs, offset, 1, pnum);
+}
+
+/**
+ * See block_int.h for this function's documentation.
+ */
+void bdrv_bsc_invalidate_range(BlockDriverState *bs,
+                               int64_t offset, int64_t bytes)
+{
+    RCU_READ_LOCK_GUARD();
+
+    if (bdrv_bsc_range_overlaps_locked(bs, offset, bytes, NULL)) {
+        qatomic_set(&bs->block_status_cache->valid, false);
+    }
+}
+
+/**
+ * See block_int.h for this function's documentation.
+ */
+void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, int64_t bytes)
+{
+    BdrvBlockStatusCache *new_bsc = g_new(BdrvBlockStatusCache, 1);
+    BdrvBlockStatusCache *old_bsc;
+
+    *new_bsc = (BdrvBlockStatusCache) {
+        .valid = true,
+        .data_start = offset,
+        .data_end = offset + bytes,
+    };
+
+    QEMU_LOCK_GUARD(&bs->bsc_modify_lock);
+
+    old_bsc = qatomic_rcu_read(&bs->block_status_cache);
+    qatomic_rcu_set(&bs->block_status_cache, new_bsc);
+    if (old_bsc) {
+        g_free_rcu(old_bsc, rcu);
+    }
+}
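For orientation, a minimal sketch of the cache structure these functions operate on, reconstructed from the accesses above (the real definition lives in the block_int.h hunk, which is not part of this excerpt; the field names are inferred, so treat this as an assumption rather than the exact header change):

/* Assumed layout, inferred from the block.c code above */
typedef struct BdrvBlockStatusCache {
    struct rcu_head rcu;   /* required by g_free_rcu(old_bsc, rcu) above */

    int64_t data_start;    /* start of the cached data region */
    int64_t data_end;      /* end (exclusive) of the cached data region */

    bool valid;            /* cleared by bdrv_bsc_invalidate_range() */
} BdrvBlockStatusCache;

BlockDriverState is assumed to gain two matching members: a CoMutex bsc_modify_lock serializing writers in bdrv_bsc_fill(), and the RCU-protected pointer BdrvBlockStatusCache *block_status_cache that readers access with qatomic_rcu_read().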

block/file-posix.c

Lines changed: 4 additions & 3 deletions
@@ -2744,7 +2744,8 @@ static int find_allocation(BlockDriverState *bs, off_t start,
  * the specified offset) that are known to be in the same
  * allocated/unallocated state.
  *
- * 'bytes' is the max value 'pnum' should be set to.
+ * 'bytes' is a soft cap for 'pnum'. If the information is free, 'pnum' may
+ * well exceed it.
  */
 static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
                                             bool want_zero,
@@ -2782,7 +2783,7 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
     } else if (data == offset) {
         /* On a data extent, compute bytes to the end of the extent,
          * possibly including a partial sector at EOF. */
-        *pnum = MIN(bytes, hole - offset);
+        *pnum = hole - offset;
 
         /*
          * We are not allowed to return partial sectors, though, so
@@ -2801,7 +2802,7 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
     } else {
         /* On a hole, compute bytes to the beginning of the next extent. */
         assert(hole == offset);
-        *pnum = MIN(bytes, data - offset);
+        *pnum = data - offset;
         ret = BDRV_BLOCK_ZERO;
     }
     *map = offset;
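The "soft cap" wording works because the driver no longer has to clamp: the generic block layer is expected to trim the result back to the caller's request after the driver returns, roughly along these lines (a hedged sketch of the caller-side clamp, not a quote of block/io.c):

    if (*pnum > bytes) {
        *pnum = bytes;   /* never report more than the caller asked about */
    }

Letting raw_co_block_status() over-report is what allows the new block-status cache in block/io.c to remember the whole data extent rather than only the queried window.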

block/gluster.c

Lines changed: 20 additions & 3 deletions
@@ -1461,7 +1461,8 @@ static int find_allocation(BlockDriverState *bs, off_t start,
  * the specified offset) that are known to be in the same
  * allocated/unallocated state.
  *
- * 'bytes' is the max value 'pnum' should be set to.
+ * 'bytes' is a soft cap for 'pnum'. If the information is free, 'pnum' may
+ * well exceed it.
  *
  * (Based on raw_co_block_status() from file-posix.c.)
  */
@@ -1477,6 +1478,8 @@ static int coroutine_fn qemu_gluster_co_block_status(BlockDriverState *bs,
     off_t data = 0, hole = 0;
     int ret = -EINVAL;
 
+    assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment));
+
     if (!s->fd) {
         return ret;
     }
@@ -1500,12 +1503,26 @@ static int coroutine_fn qemu_gluster_co_block_status(BlockDriverState *bs,
     } else if (data == offset) {
         /* On a data extent, compute bytes to the end of the extent,
          * possibly including a partial sector at EOF. */
-        *pnum = MIN(bytes, hole - offset);
+        *pnum = hole - offset;
+
+        /*
+         * We are not allowed to return partial sectors, though, so
+         * round up if necessary.
+         */
+        if (!QEMU_IS_ALIGNED(*pnum, bs->bl.request_alignment)) {
+            int64_t file_length = qemu_gluster_getlength(bs);
+            if (file_length > 0) {
+                /* Ignore errors, this is just a safeguard */
+                assert(hole == file_length);
+            }
+            *pnum = ROUND_UP(*pnum, bs->bl.request_alignment);
+        }
+
         ret = BDRV_BLOCK_DATA;
     } else {
         /* On a hole, compute bytes to the beginning of the next extent. */
         assert(hole == offset);
-        *pnum = MIN(bytes, data - offset);
+        *pnum = data - offset;
         ret = BDRV_BLOCK_ZERO;
     }
 
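As a worked example of the round-up (numbers invented for illustration): with bs->bl.request_alignment = 512 and a 1000-byte file that is entirely data, a status query at offset 512 finds hole == 1000, so *pnum is initially 488. That is not sector-aligned, qemu_gluster_getlength() confirms hole == file_length, and *pnum is rounded up to ROUND_UP(488, 512) = 512 so that no partial sector is reported.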

block/io.c

Lines changed: 65 additions & 3 deletions
@@ -1883,6 +1883,9 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
         return -ENOTSUP;
     }
 
+    /* Invalidate the cached block-status data range if this write overlaps */
+    bdrv_bsc_invalidate_range(bs, offset, bytes);
+
     assert(alignment % bs->bl.request_alignment == 0);
     head = offset % alignment;
     tail = (offset + bytes) % alignment;
@@ -2447,9 +2450,65 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
     aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;
 
     if (bs->drv->bdrv_co_block_status) {
-        ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
-                                            aligned_bytes, pnum, &local_map,
-                                            &local_file);
+        /*
+         * Use the block-status cache only for protocol nodes: Format
+         * drivers are generally quick to inquire the status, but protocol
+         * drivers often need to get information from outside of qemu, so
+         * we do not have control over the actual implementation. There
+         * have been cases where inquiring the status took an unreasonably
+         * long time, and we can do nothing in qemu to fix it.
+         * This is especially problematic for images with large data areas,
+         * because finding the few holes in them and giving them special
+         * treatment does not gain much performance. Therefore, we try to
+         * cache the last-identified data region.
+         *
+         * Second, limiting ourselves to protocol nodes allows us to assume
+         * the block status for data regions to be DATA | OFFSET_VALID, and
+         * that the host offset is the same as the guest offset.
+         *
+         * Note that it is possible that external writers zero parts of
+         * the cached regions without the cache being invalidated, and so
+         * we may report zeroes as data. This is not catastrophic,
+         * however, because reporting zeroes as data is fine.
+         */
+        if (QLIST_EMPTY(&bs->children) &&
+            bdrv_bsc_is_data(bs, aligned_offset, pnum))
+        {
+            ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
+            local_file = bs;
+            local_map = aligned_offset;
+        } else {
+            ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
+                                                aligned_bytes, pnum, &local_map,
+                                                &local_file);
+
+            /*
+             * Note that checking QLIST_EMPTY(&bs->children) is also done when
+             * the cache is queried above. Technically, we do not need to check
+             * it here; the worst that can happen is that we fill the cache for
+             * non-protocol nodes, and then it is never used. However, filling
+             * the cache requires an RCU update, so double check here to avoid
+             * such an update if possible.
+             */
+            if (ret == (BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID) &&
+                QLIST_EMPTY(&bs->children))
+            {
+                /*
+                 * When a protocol driver reports BLOCK_OFFSET_VALID, the
+                 * returned local_map value must be the same as the offset we
+                 * have passed (aligned_offset), and local_bs must be the node
+                 * itself.
+                 * Assert this, because we follow this rule when reading from
+                 * the cache (see the `local_file = bs` and
+                 * `local_map = aligned_offset` assignments above), and the
+                 * result the cache delivers must be the same as the driver
+                 * would deliver.
+                 */
+                assert(local_file == bs);
+                assert(local_map == aligned_offset);
+                bdrv_bsc_fill(bs, aligned_offset, *pnum);
+            }
+        }
     } else {
         /* Default code for filters */
 
@@ -3002,6 +3061,9 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
         return 0;
     }
 
+    /* Invalidate the cached block-status data range if this discard overlaps */
+    bdrv_bsc_invalidate_range(bs, offset, bytes);
+
     /* Discard is advisory, but some devices track and coalesce
      * unaligned requests, so we must pass everything down rather than
      * round here. Still, most devices will just silently ignore
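To see the cache in action, consider two back-to-back queries against a protocol node whose underlying file is one large data extent (a hedged usage sketch; bdrv_block_status() is the public wrapper around the code above, and the offsets are made up):

    int64_t pnum, map;
    BlockDriverState *file;

    /* First call: the driver is queried; since it reports
     * DATA | OFFSET_VALID, bdrv_bsc_fill() records the data region. */
    bdrv_block_status(bs, 0, 65536, &pnum, &map, &file);

    /* Second call, still inside the cached region: answered by
     * bdrv_bsc_is_data() without invoking the protocol driver again. */
    bdrv_block_status(bs, 65536, 65536, &pnum, &map, &file);

A write-zeroes or discard overlapping the region drops the cache via bdrv_bsc_invalidate_range(), as shown in the other two hunks of this file.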

block/iscsi.c

Lines changed: 0 additions & 3 deletions
@@ -781,9 +781,6 @@ static int coroutine_fn iscsi_co_block_status(BlockDriverState *bs,
         iscsi_allocmap_set_allocated(iscsilun, offset, *pnum);
     }
 
-    if (*pnum > bytes) {
-        *pnum = bytes;
-    }
 out_unlock:
     qemu_mutex_unlock(&iscsilun->mutex);
     g_free(iTask.err_str);

block/mirror.c

Lines changed: 16 additions & 9 deletions
@@ -160,18 +160,25 @@ static void coroutine_fn mirror_wait_on_conflicts(MirrorOp *self,
         if (ranges_overlap(self_start_chunk, self_nb_chunks,
                            op_start_chunk, op_nb_chunks))
         {
-            /*
-             * If the operation is already (indirectly) waiting for us, or
-             * will wait for us as soon as it wakes up, then just go on
-             * (instead of producing a deadlock in the former case).
-             */
-            if (op->waiting_for_op) {
-                continue;
+            if (self) {
+                /*
+                 * If the operation is already (indirectly) waiting for us,
+                 * or will wait for us as soon as it wakes up, then just go
+                 * on (instead of producing a deadlock in the former case).
+                 */
+                if (op->waiting_for_op) {
+                    continue;
+                }
+
+                self->waiting_for_op = op;
             }
 
-            self->waiting_for_op = op;
             qemu_co_queue_wait(&op->waiting_requests, NULL);
-            self->waiting_for_op = NULL;
+
+            if (self) {
+                self->waiting_for_op = NULL;
+            }
+
             break;
         }
     }
