
Commit a17a32f

Ming Lei authored and kawasaki committed
nvme: optimize passthrough IOPOLL completion for local ring context
When multiple io_uring rings poll on the same NVMe queue, one ring can find
completions belonging to another ring. The current code always uses task_work
to handle this, but that adds overhead for the common single-ring case.

This patch passes the polling io_ring_ctx through the iopoll callback chain
via io_comp_batch and stores it in the request. In the NVMe end_io handler,
we compare the polling context with the request's owning context. If they
match (local), we complete inline. If they differ (remote), or this is a
non-IOPOLL path, we use task_work as before.

Changes:

- Add poll_ctx field to struct io_comp_batch
- Add poll_ctx to struct request's hash/ipi_list union
- Set iob.poll_ctx in io_do_iopoll() before calling iopoll callbacks
- Store poll_ctx in the request in nvme_ns_chr_uring_cmd_iopoll()
- Check local vs. remote context in nvme_uring_cmd_end_io()

~10% IOPS improvement is observed in the following benchmark:

fio/t/io_uring -b512 -d128 -c32 -s32 -p1 -F1 -B[0|1] -O0 -P1 -u1 -n1 /dev/ng0n1

Signed-off-by: Ming Lei <[email protected]>
1 parent 8a473ad commit a17a32f
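The new branch in nvme_uring_cmd_end_io() boils down to one pointer comparison. Below is a minimal user-space sketch of that decision; the toy_* structs and complete_inline() are illustrative stand-ins rather than kernel APIs, and only the condition mirrors the patch: complete inline when the request was polled and req->poll_ctx matches io_uring_cmd_ctx_handle(ioucmd), otherwise punt to task_work.

/*
 * Toy, user-space model of the local vs. remote completion decision.
 * All toy_* names are hypothetical; only the comparison mirrors the patch.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_ring_ctx { int id; };

struct toy_request {
	bool polled;		/* stands in for blk_rq_is_poll(req) */
	void *poll_ctx;		/* set from iob->poll_ctx while polling */
};

struct toy_uring_cmd {
	void *owning_ctx;	/* stands in for io_uring_cmd_ctx_handle(ioucmd) */
};

/* true: complete inline; false: punt to task_work */
static bool complete_inline(const struct toy_request *req,
			    const struct toy_uring_cmd *cmd)
{
	return req->polled && req->poll_ctx == cmd->owning_ctx;
}

int main(void)
{
	struct toy_ring_ctx ring_a = { .id = 0 }, ring_b = { .id = 1 };
	struct toy_uring_cmd cmd = { .owning_ctx = &ring_a };

	struct toy_request local  = { .polled = true,  .poll_ctx = &ring_a };
	struct toy_request remote = { .polled = true,  .poll_ctx = &ring_b };
	struct toy_request irq    = { .polled = false, .poll_ctx = NULL };

	printf("local ring polls its own request: %s\n",
	       complete_inline(&local, &cmd) ? "inline" : "task_work");
	printf("other ring finds the completion:  %s\n",
	       complete_inline(&remote, &cmd) ? "inline" : "task_work");
	printf("IRQ/softirq (non-IOPOLL) path:    %s\n",
	       complete_inline(&irq, &cmd) ? "inline" : "task_work");
	return 0;
}

Only the local case skips the task_work round trip; remote and IRQ-driven completions keep the existing behavior.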

File tree

4 files changed: +39 −9 lines

drivers/nvme/host/ioctl.c

Lines changed: 28 additions & 8 deletions
@@ -425,14 +425,28 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
 	pdu->result = le64_to_cpu(nvme_req(req)->result.u64);
 
 	/*
-	 * IOPOLL could potentially complete this request directly, but
-	 * if multiple rings are polling on the same queue, then it's possible
-	 * for one ring to find completions for another ring. Punting the
-	 * completion via task_work will always direct it to the right
-	 * location, rather than potentially complete requests for ringA
-	 * under iopoll invocations from ringB.
+	 * For IOPOLL, check if this completion is happening in the context
+	 * of the same io_ring that owns the request (local context). If so,
+	 * we can complete inline without task_work overhead. Otherwise, we
+	 * must punt to task_work to ensure completion happens in the correct
+	 * ring's context.
 	 */
-	io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
+	if (blk_rq_is_poll(req) && req->poll_ctx == io_uring_cmd_ctx_handle(ioucmd)) {
+		/*
+		 * Local context: the polling ring owns this request.
+		 * Complete inline for optimal performance.
+		 */
+		if (pdu->bio)
+			blk_rq_unmap_user(pdu->bio);
+		io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, 0);
+	} else {
+		/*
+		 * Remote or non-IOPOLL context: either a different ring found
+		 * this completion, or this is IRQ/softirq completion. Use
+		 * task_work to direct completion to the correct location.
+		 */
+		io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
+	}
 	return RQ_END_IO_FREE;
 }
 
@@ -677,8 +691,14 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
 	struct request *req = pdu->req;
 
-	if (req && blk_rq_is_poll(req))
+	if (req && blk_rq_is_poll(req)) {
+		/*
+		 * Store the polling context in the request so end_io can
+		 * detect if it's completing in the local ring's context.
+		 */
+		req->poll_ctx = iob ? iob->poll_ctx : NULL;
 		return blk_rq_poll(req, iob, poll_flags);
+	}
 	return 0;
 }
 #ifdef CONFIG_NVME_MULTIPATH

include/linux/blk-mq.h

Lines changed: 3 additions & 1 deletion
@@ -175,11 +175,13 @@ struct request {
 	 * request reaches the dispatch list. The ipi_list is only used
 	 * to queue the request for softirq completion, which is long
 	 * after the request has been unhashed (and even removed from
-	 * the dispatch list).
+	 * the dispatch list). poll_ctx is used during iopoll to track
+	 * the io_ring_ctx that initiated the poll operation.
 	 */
 	union {
 		struct hlist_node hash;		/* merge hash */
 		struct llist_node ipi_list;
+		void *poll_ctx;			/* iopoll context */
 	};
 
 	/*

include/linux/blkdev.h

Lines changed: 1 addition & 0 deletions
@@ -1820,6 +1820,7 @@ void bdev_fput(struct file *bdev_file);
 
 struct io_comp_batch {
 	struct rq_list req_list;
+	void *poll_ctx;
 	bool need_ts;
 	void (*complete)(struct io_comp_batch *);
 };

io_uring/rw.c

Lines changed: 7 additions & 0 deletions
@@ -1320,6 +1320,13 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 	DEFINE_IO_COMP_BATCH(iob);
 	int nr_events = 0;
 
+	/*
+	 * Store the polling ctx so drivers can detect if they're completing
+	 * a request from the same ring that's polling (local) vs a different
+	 * ring (remote). This enables optimizations for local completions.
+	 */
+	iob.poll_ctx = ctx;
+
 	/*
 	 * Only spin for completions if we don't have multiple devices hanging
 	 * off our complete list.
