Commit 7838c67
block/nvme: support nested aio_poll()
QEMU block drivers are supposed to support aio_poll() from I/O completion
callback functions. This means completion processing must be re-entrant.

The standard approach is to schedule a BH during completion processing and
cancel it at the end of processing. If aio_poll() is invoked by a callback
function then the BH will run. The BH continues the suspended completion
processing.

All of this means that request A's cb() can synchronously wait for request B
to complete. Previously the nvme block driver would hang because it didn't
process completions from nested aio_poll().

Signed-off-by: Stefan Hajnoczi <[email protected]>
Reviewed-by: Sergio Lopez <[email protected]>
Message-id: [email protected]
Signed-off-by: Stefan Hajnoczi <[email protected]>
1 parent b75fd5f commit 7838c67
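The BH pattern described in the commit message can be summarized with a minimal sketch, shown below. MyQueue, my_pop_completion() and my_complete_request() are illustrative stand-ins invented for this sketch, not code from the patch; only the QEMUBH calls (qemu_bh_schedule(), qemu_bh_cancel()) are real QEMU API.

/*
 * Minimal sketch of the re-entrancy pattern, not the driver code itself.
 * MyQueue, my_pop_completion() and my_complete_request() are hypothetical.
 */
#include "qemu/osdep.h"
#include "block/aio.h"

typedef struct {
    QEMUBH *completion_bh;  /* created once with aio_bh_new() */
    /* ... device state ... */
} MyQueue;

static void my_process_completions(MyQueue *q);

/* BH callback: re-enters completion processing from a nested aio_poll() */
static void my_completion_bh(void *opaque)
{
    my_process_completions(opaque);
}

static bool my_pop_completion(MyQueue *q, int *ret)
{
    *ret = 0;
    return false;  /* hypothetical: fetch one completion, if any */
}

static void my_complete_request(MyQueue *q, int ret)
{
    /* hypothetical: invoke the request's cb(); it may call aio_poll() */
}

static void my_process_completions(MyQueue *q)
{
    int ret;

    /*
     * Make pending completions reachable from a nested event loop: if a
     * callback below blocks in aio_poll(), the loop runs completion_bh,
     * which calls back into this function and keeps draining the queue.
     */
    qemu_bh_schedule(q->completion_bh);

    while (my_pop_completion(q, &ret)) {
        my_complete_request(q, ret);
    }

    /* Everything visible has been processed; the safety net can go. */
    qemu_bh_cancel(q->completion_bh);
}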

File tree

2 files changed: +58 -7 lines changed

block/nvme.c

Lines changed: 57 additions & 6 deletions
@@ -74,9 +74,11 @@ typedef struct {
     int cq_phase;
     int free_req_head;
     NVMeRequest reqs[NVME_NUM_REQS];
-    bool busy;
     int need_kick;
     int inflight;
+
+    /* Thread-safe, no lock necessary */
+    QEMUBH *completion_bh;
 } NVMeQueuePair;
 
 /* Memory mapped registers */
@@ -140,6 +142,8 @@ struct BDRVNVMeState {
 #define NVME_BLOCK_OPT_DEVICE "device"
 #define NVME_BLOCK_OPT_NAMESPACE "namespace"
 
+static void nvme_process_completion_bh(void *opaque);
+
 static QemuOptsList runtime_opts = {
     .name = "nvme",
     .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
@@ -181,6 +185,9 @@ static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q,
 
 static void nvme_free_queue_pair(NVMeQueuePair *q)
 {
+    if (q->completion_bh) {
+        qemu_bh_delete(q->completion_bh);
+    }
     qemu_vfree(q->prp_list_pages);
     qemu_vfree(q->sq.queue);
     qemu_vfree(q->cq.queue);
@@ -214,6 +221,8 @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
     q->index = idx;
     qemu_co_queue_init(&q->free_req_queue);
     q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS);
+    q->completion_bh = aio_bh_new(bdrv_get_aio_context(bs),
+                                  nvme_process_completion_bh, q);
     r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
                           s->page_size * NVME_NUM_REQS,
                           false, &prp_list_iova);
@@ -352,11 +361,21 @@ static bool nvme_process_completion(NVMeQueuePair *q)
     NvmeCqe *c;
 
     trace_nvme_process_completion(s, q->index, q->inflight);
-    if (q->busy || s->plugged) {
-        trace_nvme_process_completion_queue_busy(s, q->index);
+    if (s->plugged) {
+        trace_nvme_process_completion_queue_plugged(s, q->index);
         return false;
     }
-    q->busy = true;
+
+    /*
+     * Support re-entrancy when a request cb() function invokes aio_poll().
+     * Pending completions must be visible to aio_poll() so that a cb()
+     * function can wait for the completion of another request.
+     *
+     * The aio_poll() loop will execute our BH and we'll resume completion
+     * processing there.
+     */
+    qemu_bh_schedule(q->completion_bh);
+
     assert(q->inflight >= 0);
     while (q->inflight) {
         int ret;
@@ -384,10 +403,10 @@ static bool nvme_process_completion(NVMeQueuePair *q)
         assert(req.cb);
         nvme_put_free_req_locked(q, preq);
         preq->cb = preq->opaque = NULL;
+        q->inflight--;
         qemu_mutex_unlock(&q->lock);
         req.cb(req.opaque, ret);
         qemu_mutex_lock(&q->lock);
-        q->inflight--;
         progress = true;
     }
     if (progress) {
@@ -396,10 +415,28 @@ static bool nvme_process_completion(NVMeQueuePair *q)
         *q->cq.doorbell = cpu_to_le32(q->cq.head);
         nvme_wake_free_req_locked(q);
     }
-    q->busy = false;
+
+    qemu_bh_cancel(q->completion_bh);
+
     return progress;
 }
 
+static void nvme_process_completion_bh(void *opaque)
+{
+    NVMeQueuePair *q = opaque;
+
+    /*
+     * We're being invoked because a nvme_process_completion() cb() function
+     * called aio_poll(). The callback may be waiting for further completions
+     * so notify the device that it has space to fill in more completions now.
+     */
+    smp_mb_release();
+    *q->cq.doorbell = cpu_to_le32(q->cq.head);
+    nvme_wake_free_req_locked(q);
+
+    nvme_process_completion(q);
+}
+
 static void nvme_trace_command(const NvmeCmd *cmd)
 {
     int i;
@@ -1309,6 +1346,13 @@ static void nvme_detach_aio_context(BlockDriverState *bs)
 {
     BDRVNVMeState *s = bs->opaque;
 
+    for (int i = 0; i < s->nr_queues; i++) {
+        NVMeQueuePair *q = s->queues[i];
+
+        qemu_bh_delete(q->completion_bh);
+        q->completion_bh = NULL;
+    }
+
     aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
                            false, NULL, NULL);
 }
@@ -1321,6 +1365,13 @@ static void nvme_attach_aio_context(BlockDriverState *bs,
     s->aio_context = new_context;
     aio_set_event_notifier(new_context, &s->irq_notifier,
                            false, nvme_handle_event, nvme_poll_cb);
+
+    for (int i = 0; i < s->nr_queues; i++) {
+        NVMeQueuePair *q = s->queues[i];
+
+        q->completion_bh =
+            aio_bh_new(new_context, nvme_process_completion_bh, q);
+    }
 }
 
 static void nvme_aio_plug(BlockDriverState *bs)
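For context, a hypothetical illustration of the situation this change enables: request A's completion callback waits synchronously for request B by spinning in aio_poll(). The callback name and the done flag are invented for this example; aio_poll() and qemu_get_current_aio_context() are real QEMU APIs. With the completion BH scheduled, each aio_poll() iteration can run nvme_process_completion_bh() and process request B's completion even though we are still inside request A's cb().

/* Hypothetical completion callback for "request A" (not in the patch). */
static void request_a_cb(void *opaque, int ret)
{
    bool *request_b_done = opaque;  /* set to true by request B's cb() */

    /*
     * Nested event loop: the scheduled completion BH lets this loop
     * observe and process request B's completion.
     */
    while (!*request_b_done) {
        aio_poll(qemu_get_current_aio_context(), true);
    }
}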

block/trace-events

Lines changed: 1 addition & 1 deletion
@@ -158,7 +158,7 @@ nvme_kick(void *s, int queue) "s %p queue %d"
 nvme_dma_flush_queue_wait(void *s) "s %p"
 nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x"
 nvme_process_completion(void *s, int index, int inflight) "s %p queue %d inflight %d"
-nvme_process_completion_queue_busy(void *s, int index) "s %p queue %d"
+nvme_process_completion_queue_plugged(void *s, int index) "s %p queue %d"
 nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid %d"
 nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d"
 nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"
