Skip to content

Commit 1086e95

Browse files
committed
block/nvme: switch to a NVMeRequest freelist
There are three issues with the current NVMeRequest->busy field: 1. The busy field is accidentally accessed outside q->lock when request submission fails. 2. Waiters on free_req_queue are not woken when a request is returned early due to submission failure. 2. Finding a free request involves scanning all requests. This makes request submission O(n^2). Switch to an O(1) freelist that is always accessed under the lock. Also differentiate between NVME_QUEUE_SIZE, the actual SQ/CQ size, and NVME_NUM_REQS, the number of usable requests. This makes the code simpler than using NVME_QUEUE_SIZE everywhere and having to keep in mind that one slot is reserved. Signed-off-by: Stefan Hajnoczi <[email protected]> Reviewed-by: Sergio Lopez <[email protected]> Message-id: [email protected] Signed-off-by: Stefan Hajnoczi <[email protected]>
1 parent 04b3fb3 commit 1086e95

File tree

1 file changed

+54
-27
lines changed

1 file changed

+54
-27
lines changed

block/nvme.c

Lines changed: 54 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,12 @@
3333
#define NVME_QUEUE_SIZE 128
3434
#define NVME_BAR_SIZE 8192
3535

36+
/*
37+
* We have to leave one slot empty as that is the full queue case where
38+
* head == tail + 1.
39+
*/
40+
#define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1)
41+
3642
typedef struct {
3743
int32_t head, tail;
3844
uint8_t *queue;
@@ -47,7 +53,7 @@ typedef struct {
4753
int cid;
4854
void *prp_list_page;
4955
uint64_t prp_list_iova;
50-
bool busy;
56+
int free_req_next; /* q->reqs[] index of next free req */
5157
} NVMeRequest;
5258

5359
typedef struct {
@@ -61,7 +67,8 @@ typedef struct {
6167
/* Fields protected by @lock */
6268
NVMeQueue sq, cq;
6369
int cq_phase;
64-
NVMeRequest reqs[NVME_QUEUE_SIZE];
70+
int free_req_head;
71+
NVMeRequest reqs[NVME_NUM_REQS];
6572
bool busy;
6673
int need_kick;
6774
int inflight;
@@ -200,19 +207,23 @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
200207
qemu_mutex_init(&q->lock);
201208
q->index = idx;
202209
qemu_co_queue_init(&q->free_req_queue);
203-
q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_QUEUE_SIZE);
210+
q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS);
204211
r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
205-
s->page_size * NVME_QUEUE_SIZE,
212+
s->page_size * NVME_NUM_REQS,
206213
false, &prp_list_iova);
207214
if (r) {
208215
goto fail;
209216
}
210-
for (i = 0; i < NVME_QUEUE_SIZE; i++) {
217+
q->free_req_head = -1;
218+
for (i = 0; i < NVME_NUM_REQS; i++) {
211219
NVMeRequest *req = &q->reqs[i];
212220
req->cid = i + 1;
221+
req->free_req_next = q->free_req_head;
222+
q->free_req_head = i;
213223
req->prp_list_page = q->prp_list_pages + i * s->page_size;
214224
req->prp_list_iova = prp_list_iova + i * s->page_size;
215225
}
226+
216227
nvme_init_queue(bs, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err);
217228
if (local_err) {
218229
error_propagate(errp, local_err);
@@ -254,13 +265,11 @@ static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q)
254265
*/
255266
static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
256267
{
257-
int i;
258-
NVMeRequest *req = NULL;
268+
NVMeRequest *req;
259269

260270
qemu_mutex_lock(&q->lock);
261-
while (q->inflight + q->need_kick > NVME_QUEUE_SIZE - 2) {
262-
/* We have to leave one slot empty as that is the full queue case (head
263-
* == tail + 1). */
271+
272+
while (q->free_req_head == -1) {
264273
if (qemu_in_coroutine()) {
265274
trace_nvme_free_req_queue_wait(q);
266275
qemu_co_queue_wait(&q->free_req_queue, &q->lock);
@@ -269,20 +278,41 @@ static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
269278
return NULL;
270279
}
271280
}
272-
for (i = 0; i < NVME_QUEUE_SIZE; i++) {
273-
if (!q->reqs[i].busy) {
274-
q->reqs[i].busy = true;
275-
req = &q->reqs[i];
276-
break;
277-
}
278-
}
279-
/* We have checked inflight and need_kick while holding q->lock, so one
280-
* free req must be available. */
281-
assert(req);
281+
282+
req = &q->reqs[q->free_req_head];
283+
q->free_req_head = req->free_req_next;
284+
req->free_req_next = -1;
285+
282286
qemu_mutex_unlock(&q->lock);
283287
return req;
284288
}
285289

290+
/* With q->lock */
291+
static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req)
292+
{
293+
req->free_req_next = q->free_req_head;
294+
q->free_req_head = req - q->reqs;
295+
}
296+
297+
/* With q->lock */
298+
static void nvme_wake_free_req_locked(BDRVNVMeState *s, NVMeQueuePair *q)
299+
{
300+
if (!qemu_co_queue_empty(&q->free_req_queue)) {
301+
replay_bh_schedule_oneshot_event(s->aio_context,
302+
nvme_free_req_queue_cb, q);
303+
}
304+
}
305+
306+
/* Insert a request in the freelist and wake waiters */
307+
static void nvme_put_free_req_and_wake(BDRVNVMeState *s, NVMeQueuePair *q,
308+
NVMeRequest *req)
309+
{
310+
qemu_mutex_lock(&q->lock);
311+
nvme_put_free_req_locked(q, req);
312+
nvme_wake_free_req_locked(s, q);
313+
qemu_mutex_unlock(&q->lock);
314+
}
315+
286316
static inline int nvme_translate_error(const NvmeCqe *c)
287317
{
288318
uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
@@ -344,7 +374,7 @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
344374
req = *preq;
345375
assert(req.cid == cid);
346376
assert(req.cb);
347-
preq->busy = false;
377+
nvme_put_free_req_locked(q, preq);
348378
preq->cb = preq->opaque = NULL;
349379
qemu_mutex_unlock(&q->lock);
350380
req.cb(req.opaque, ret);
@@ -356,10 +386,7 @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
356386
/* Notify the device so it can post more completions. */
357387
smp_mb_release();
358388
*q->cq.doorbell = cpu_to_le32(q->cq.head);
359-
if (!qemu_co_queue_empty(&q->free_req_queue)) {
360-
replay_bh_schedule_oneshot_event(s->aio_context,
361-
nvme_free_req_queue_cb, q);
362-
}
389+
nvme_wake_free_req_locked(s, q);
363390
}
364391
q->busy = false;
365392
return progress;
@@ -1001,7 +1028,7 @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
10011028
r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
10021029
qemu_co_mutex_unlock(&s->dma_map_lock);
10031030
if (r) {
1004-
req->busy = false;
1031+
nvme_put_free_req_and_wake(s, ioq, req);
10051032
return r;
10061033
}
10071034
nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
@@ -1218,7 +1245,7 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
12181245
qemu_co_mutex_unlock(&s->dma_map_lock);
12191246

12201247
if (ret) {
1221-
req->busy = false;
1248+
nvme_put_free_req_and_wake(s, ioq, req);
12221249
goto out;
12231250
}
12241251

0 commit comments

Comments
 (0)