diff --git a/Documentation/block/ublk.rst b/Documentation/block/ublk.rst index 8c4030bcabb6..6ad28039663d 100644 --- a/Documentation/block/ublk.rst +++ b/Documentation/block/ublk.rst @@ -260,9 +260,12 @@ The following IO commands are communicated via io_uring passthrough command, and each command is only for forwarding the IO and committing the result with specified IO tag in the command data: -- ``UBLK_IO_FETCH_REQ`` +Traditional Per-I/O Commands +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Sent from the server IO pthread for fetching future incoming IO requests +- ``UBLK_U_IO_FETCH_REQ`` + + Sent from the server I/O pthread for fetching future incoming I/O requests destined to ``/dev/ublkb*``. This command is sent only once from the server IO pthread for ublk driver to setup IO forward environment. @@ -278,7 +281,7 @@ with specified IO tag in the command data: supported by the driver, daemons must be per-queue instead - i.e. all I/Os associated to a single qid must be handled by the same task. -- ``UBLK_IO_COMMIT_AND_FETCH_REQ`` +- ``UBLK_U_IO_COMMIT_AND_FETCH_REQ`` When an IO request is destined to ``/dev/ublkb*``, the driver stores the IO's ``ublksrv_io_desc`` to the specified mapped area; then the @@ -293,7 +296,7 @@ with specified IO tag in the command data: requests with the same IO tag. That is, ``UBLK_IO_COMMIT_AND_FETCH_REQ`` is reused for both fetching request and committing back IO result. -- ``UBLK_IO_NEED_GET_DATA`` +- ``UBLK_U_IO_NEED_GET_DATA`` With ``UBLK_F_NEED_GET_DATA`` enabled, the WRITE request will be firstly issued to ublk server without data copy. Then, IO backend of ublk server @@ -322,6 +325,59 @@ with specified IO tag in the command data: ``UBLK_IO_COMMIT_AND_FETCH_REQ`` to the server, ublkdrv needs to copy the server buffer (pages) read to the IO request pages. +Batch I/O Commands (UBLK_F_BATCH_IO) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``UBLK_F_BATCH_IO`` feature provides an alternative high-performance +I/O handling model that replaces the traditional per-I/O commands with +per-queue batch commands. This significantly reduces communication overhead +and enables better load balancing across multiple server tasks. + +Key differences from traditional mode: + +- **Per-queue vs Per-I/O**: Commands operate on queues rather than individual I/Os +- **Batch processing**: Multiple I/Os are handled in single operations +- **Multishot commands**: Use io_uring multishot for reduced submission overhead +- **Flexible task assignment**: Any task can handle any I/O (no per-I/O daemons) +- **Better load balancing**: Tasks can adjust their workload dynamically + +Batch I/O Commands: + +- ``UBLK_U_IO_PREP_IO_CMDS`` + + Prepares multiple I/O commands in batch. The server provides a buffer + containing multiple I/O descriptors that will be processed together. + This reduces the number of individual command submissions required. + +- ``UBLK_U_IO_COMMIT_IO_CMDS`` + + Commits results for multiple I/O operations in batch, and prepares the + I/O descriptors to accept new requests. The server provides a buffer + containing the results of multiple completed I/Os, allowing efficient + bulk completion of requests. + +- ``UBLK_U_IO_FETCH_IO_CMDS`` + + **Multishot command** for fetching I/O commands in batch. 
This is the key + command that enables high-performance batch processing: + + * Uses io_uring multishot capability for reduced submission overhead + * Single command can fetch multiple I/O requests over time + * Buffer size determines maximum batch size per operation + * Multiple fetch commands can be submitted for load balancing + * Only one fetch command is active at any time per queue + * Supports dynamic load balancing across multiple server tasks + + It is one typical multishot io_uring request with provided buffer, and it + won't be completed until any failure is triggered. + + Each task can submit ``UBLK_U_IO_FETCH_IO_CMDS`` with different buffer + sizes to control how much work it handles. This enables sophisticated + load balancing strategies in multi-threaded servers. + +Migration: Applications using traditional commands (``UBLK_U_IO_FETCH_REQ``, +``UBLK_U_IO_COMMIT_AND_FETCH_REQ``) cannot use batch mode simultaneously. + Zero copy --------- diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index aaf94d2fb789..1e374ecbf0f1 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -78,7 +79,8 @@ | UBLK_F_PER_IO_DAEMON \ | UBLK_F_BUF_REG_OFF_DAEMON \ | (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \ - | UBLK_F_SAFE_STOP_DEV) + | UBLK_F_SAFE_STOP_DEV \ + | UBLK_F_BATCH_IO) #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \ | UBLK_F_USER_RECOVERY_REISSUE \ @@ -91,6 +93,18 @@ UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT | \ UBLK_PARAM_TYPE_INTEGRITY) +#define UBLK_BATCH_F_ALL \ + (UBLK_BATCH_F_HAS_ZONE_LBA | \ + UBLK_BATCH_F_HAS_BUF_ADDR | \ + UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) + +/* ublk batch fetch uring_cmd */ +struct ublk_batch_fetch_cmd { + struct list_head node; + struct io_uring_cmd *cmd; + unsigned short buf_group; +}; + struct ublk_uring_cmd_pdu { /* * Store requests in same batch temporarily for queuing them to @@ -111,7 +125,18 @@ struct ublk_uring_cmd_pdu { */ struct ublk_queue *ubq; - u16 tag; + union { + u16 tag; + struct ublk_batch_fetch_cmd *fcmd; /* batch io only */ + }; +}; + +struct ublk_batch_io_data { + struct ublk_device *ub; + struct io_uring_cmd *cmd; + struct ublk_batch_io header; + unsigned int issue_flags; + struct io_comp_batch *iob; }; /* @@ -161,6 +186,9 @@ struct ublk_uring_cmd_pdu { */ #define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2) +/* used for UBLK_F_BATCH_IO only */ +#define UBLK_BATCH_IO_UNUSED_TAG ((unsigned short)-1) + union ublk_io_buf { __u64 addr; struct ublk_auto_buf_reg auto_reg; @@ -196,6 +224,7 @@ struct ublk_io { unsigned task_registered_buffers; void *buf_ctx_handle; + spinlock_t lock; } ____cacheline_aligned_in_smp; struct ublk_queue { @@ -210,6 +239,52 @@ struct ublk_queue { bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */ spinlock_t cancel_lock; struct ublk_device *dev; + u32 nr_io_ready; + + /* + * For supporting UBLK_F_BATCH_IO only. + * + * Inflight ublk request tag is saved in this fifo + * + * There are multiple writer from ublk_queue_rq() or ublk_queue_rqs(), + * so lock is required for storing request tag to fifo + * + * Make sure just one reader for fetching request from task work + * function to ublk server, so no need to grab the lock in reader + * side. + * + * Batch I/O State Management: + * + * The batch I/O system uses implicit state management based on the + * combination of three key variables below. 
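+	 * The three variables are ->evts_fifo, ->fcmd_head and ->active_fcmd.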
+ * + * - IDLE: list_empty(&fcmd_head) && !active_fcmd + * No fetch commands available, events queue in evts_fifo + * + * - READY: !list_empty(&fcmd_head) && !active_fcmd + * Fetch commands available but none processing events + * + * - ACTIVE: active_fcmd + * One fetch command actively processing events from evts_fifo + * + * Key Invariants: + * - At most one active_fcmd at any time (single reader) + * - active_fcmd is always from fcmd_head list when non-NULL + * - evts_fifo can be read locklessly by the single active reader + * - All state transitions require evts_lock protection + * - Multiple writers to evts_fifo require lock protection + */ + struct { + DECLARE_KFIFO_PTR(evts_fifo, unsigned short); + spinlock_t evts_lock; + + /* List of fetch commands available to process events */ + struct list_head fcmd_head; + + /* Currently active fetch command (NULL = none active) */ + struct ublk_batch_fetch_cmd *active_fcmd; + }____cacheline_aligned_in_smp; + struct ublk_io ios[] __counted_by(q_depth); }; @@ -237,7 +312,7 @@ struct ublk_device { struct ublk_params params; struct completion completion; - u32 nr_io_ready; + u32 nr_queue_ready; bool unprivileged_daemons; struct mutex cancel_mutex; bool canceling; @@ -262,6 +337,49 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq); static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, u16 q_id, u16 tag, struct ublk_io *io); static inline unsigned int ublk_req_build_flags(struct request *req); +static void ublk_batch_dispatch(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + struct ublk_batch_fetch_cmd *fcmd); + +static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_BATCH_IO; +} + +static inline bool ublk_support_batch_io(const struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_BATCH_IO; +} + +static inline void ublk_io_lock(struct ublk_io *io) +{ + spin_lock(&io->lock); +} + +static inline void ublk_io_unlock(struct ublk_io *io) +{ + spin_unlock(&io->lock); +} + +/* Initialize the event queue */ +static inline int ublk_io_evts_init(struct ublk_queue *q, unsigned int size, + int numa_node) +{ + spin_lock_init(&q->evts_lock); + return kfifo_alloc_node(&q->evts_fifo, size, GFP_KERNEL, numa_node); +} + +/* Check if event queue is empty */ +static inline bool ublk_io_evts_empty(const struct ublk_queue *q) +{ + return kfifo_is_empty(&q->evts_fifo); +} + +static inline void ublk_io_evts_deinit(struct ublk_queue *q) +{ + WARN_ON_ONCE(!kfifo_is_empty(&q->evts_fifo)); + kfifo_free(&q->evts_fifo); +} static inline struct ublksrv_io_desc * ublk_get_iod(const struct ublk_queue *ubq, unsigned tag) @@ -575,7 +693,7 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, #endif static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io, - bool need_map); + bool need_map, struct io_comp_batch *iob); static dev_t ublk_chr_devt; static const struct class ublk_chr_class = { @@ -588,6 +706,64 @@ static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */ static DEFINE_MUTEX(ublk_ctl_mutex); +static struct ublk_batch_fetch_cmd * +ublk_batch_alloc_fcmd(struct io_uring_cmd *cmd) +{ + struct ublk_batch_fetch_cmd *fcmd = kzalloc(sizeof(*fcmd), GFP_NOIO); + + if (fcmd) { + fcmd->cmd = cmd; + fcmd->buf_group = READ_ONCE(cmd->sqe->buf_index); + } + return fcmd; +} + +static void ublk_batch_free_fcmd(struct ublk_batch_fetch_cmd *fcmd) +{ + kfree(fcmd); +} + +static void __ublk_release_fcmd(struct 
ublk_queue *ubq) +{ + WRITE_ONCE(ubq->active_fcmd, NULL); +} + +/* + * Nothing can move on, so clear ->active_fcmd, and the caller should stop + * dispatching + */ +static void ublk_batch_deinit_fetch_buf(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + struct ublk_batch_fetch_cmd *fcmd, + int res) +{ + spin_lock(&ubq->evts_lock); + list_del(&fcmd->node); + WARN_ON_ONCE(fcmd != ubq->active_fcmd); + __ublk_release_fcmd(ubq); + spin_unlock(&ubq->evts_lock); + + io_uring_cmd_done(fcmd->cmd, res, data->issue_flags); + ublk_batch_free_fcmd(fcmd); +} + +static int ublk_batch_fetch_post_cqe(struct ublk_batch_fetch_cmd *fcmd, + struct io_br_sel *sel, + unsigned int issue_flags) +{ + if (io_uring_mshot_cmd_post_cqe(fcmd->cmd, sel, issue_flags)) + return -ENOBUFS; + return 0; +} + +static ssize_t ublk_batch_copy_io_tags(struct ublk_batch_fetch_cmd *fcmd, + void __user *buf, const u16 *tag_buf, + unsigned int len) +{ + if (copy_to_user(buf, tag_buf, len)) + return -EFAULT; + return len; +} #define UBLK_MAX_UBLKS UBLK_MINORS @@ -827,7 +1003,7 @@ static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req) return; /* ublk_need_map_io() and ublk_need_req_ref() are mutually exclusive */ - __ublk_complete_rq(req, io, false); + __ublk_complete_rq(req, io, false, NULL); } static inline bool ublk_sub_req_ref(struct ublk_io *io) @@ -1214,7 +1390,7 @@ static void ublk_end_request(struct request *req, blk_status_t error) /* todo: handle partial completion */ static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io, - bool need_map) + bool need_map, struct io_comp_batch *iob) { unsigned int unmapped_bytes; blk_status_t res = BLK_STS_OK; @@ -1268,8 +1444,11 @@ static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io, local_bh_enable(); if (requeue) blk_mq_requeue_request(req, true); - else if (likely(!blk_should_fake_timeout(req->q))) + else if (likely(!blk_should_fake_timeout(req->q))) { + if (blk_mq_add_to_batch(req, iob, false, blk_mq_end_request_batch)) + return; __blk_mq_end_request(req, BLK_STS_OK); + } return; exit: @@ -1455,6 +1634,240 @@ static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req) } } +static bool __ublk_batch_prep_dispatch(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + unsigned short tag) +{ + struct ublk_device *ub = data->ub; + struct ublk_io *io = &ubq->ios[tag]; + struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); + enum auto_buf_reg_res res = AUTO_BUF_REG_FALLBACK; + struct io_uring_cmd *cmd = data->cmd; + + if (!ublk_start_io(ubq, req, io)) + return false; + + if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) { + res = __ublk_do_auto_buf_reg(ubq, req, io, cmd, + data->issue_flags); + + if (res == AUTO_BUF_REG_FAIL) + return false; + } + + ublk_io_lock(io); + ublk_prep_auto_buf_reg_io(ubq, req, io, cmd, res); + ublk_io_unlock(io); + + return true; +} + +static bool ublk_batch_prep_dispatch(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + unsigned short *tag_buf, + unsigned int len) +{ + bool has_unused = false; + unsigned int i; + + for (i = 0; i < len; i++) { + unsigned short tag = tag_buf[i]; + + if (!__ublk_batch_prep_dispatch(ubq, data, tag)) { + tag_buf[i] = UBLK_BATCH_IO_UNUSED_TAG; + has_unused = true; + } + } + + return has_unused; +} + +/* + * Filter out UBLK_BATCH_IO_UNUSED_TAG entries from tag_buf. + * Returns the new length after filtering. 
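+ *
+ * For example, with U denoting UBLK_BATCH_IO_UNUSED_TAG, the buffer
+ * {3, U, 7, U, 9} is compacted in place to {3, 7, 9} and 3 is returned.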
+ */ +static unsigned int ublk_filter_unused_tags(unsigned short *tag_buf, + unsigned int len) +{ + unsigned int i, j; + + for (i = 0, j = 0; i < len; i++) { + if (tag_buf[i] != UBLK_BATCH_IO_UNUSED_TAG) { + if (i != j) + tag_buf[j] = tag_buf[i]; + j++; + } + } + + return j; +} + +#define MAX_NR_TAG 128 +static int __ublk_batch_dispatch(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + struct ublk_batch_fetch_cmd *fcmd) +{ + const unsigned int tag_sz = sizeof(unsigned short); + unsigned short tag_buf[MAX_NR_TAG]; + struct io_br_sel sel; + size_t len = 0; + bool needs_filter; + int ret; + + WARN_ON_ONCE(data->cmd != fcmd->cmd); + + sel = io_uring_cmd_buffer_select(fcmd->cmd, fcmd->buf_group, &len, + data->issue_flags); + if (sel.val < 0) + return sel.val; + if (!sel.addr) + return -ENOBUFS; + + /* single reader needn't lock and sizeof(kfifo element) is 2 bytes */ + len = min(len, sizeof(tag_buf)) / tag_sz; + len = kfifo_out(&ubq->evts_fifo, tag_buf, len); + + needs_filter = ublk_batch_prep_dispatch(ubq, data, tag_buf, len); + /* Filter out unused tags before posting to userspace */ + if (unlikely(needs_filter)) { + int new_len = ublk_filter_unused_tags(tag_buf, len); + + /* return actual length if all are failed or requeued */ + if (!new_len) { + /* release the selected buffer */ + sel.val = 0; + WARN_ON_ONCE(!io_uring_mshot_cmd_post_cqe(fcmd->cmd, + &sel, data->issue_flags)); + return len; + } + len = new_len; + } + + sel.val = ublk_batch_copy_io_tags(fcmd, sel.addr, tag_buf, len * tag_sz); + ret = ublk_batch_fetch_post_cqe(fcmd, &sel, data->issue_flags); + if (unlikely(ret < 0)) { + int i, res; + + /* + * Undo prep state for all IOs since userspace never received them. + * This restores IOs to pre-prepared state so they can be cleanly + * re-prepared when tags are pulled from FIFO again. + */ + for (i = 0; i < len; i++) { + struct ublk_io *io = &ubq->ios[tag_buf[i]]; + int index = -1; + + ublk_io_lock(io); + if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) + index = io->buf.auto_reg.index; + io->flags &= ~(UBLK_IO_FLAG_OWNED_BY_SRV | UBLK_IO_FLAG_AUTO_BUF_REG); + io->flags |= UBLK_IO_FLAG_ACTIVE; + ublk_io_unlock(io); + + if (index != -1) + io_buffer_unregister_bvec(data->cmd, index, + data->issue_flags); + } + + res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo, + tag_buf, len, &ubq->evts_lock); + + pr_warn_ratelimited("%s: copy tags or post CQE failure, move back " + "tags(%d %zu) ret %d\n", __func__, res, len, + ret); + } + return ret; +} + +static struct ublk_batch_fetch_cmd *__ublk_acquire_fcmd( + struct ublk_queue *ubq) +{ + struct ublk_batch_fetch_cmd *fcmd; + + lockdep_assert_held(&ubq->evts_lock); + + /* + * Ordering updating ubq->evts_fifo and checking ubq->active_fcmd. + * + * The pair is the smp_mb() in ublk_batch_dispatch(). + * + * If ubq->active_fcmd is observed as non-NULL, the new added tags + * can be visisible in ublk_batch_dispatch() with the barrier pairing. 
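+	 *
+	 * In other words: either this caller observes ubq->active_fcmd == NULL
+	 * and takes over dispatching itself, or the current owner is guaranteed
+	 * to see the newly added tags once it clears ->active_fcmd and re-checks
+	 * ->evts_fifo in ublk_batch_dispatch().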
+ */ + smp_mb(); + if (READ_ONCE(ubq->active_fcmd)) { + fcmd = NULL; + } else { + fcmd = list_first_entry_or_null(&ubq->fcmd_head, + struct ublk_batch_fetch_cmd, node); + WRITE_ONCE(ubq->active_fcmd, fcmd); + } + return fcmd; +} + +static void ublk_batch_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw) +{ + unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS; + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd; + struct ublk_batch_io_data data = { + .ub = pdu->ubq->dev, + .cmd = fcmd->cmd, + .issue_flags = issue_flags, + }; + + WARN_ON_ONCE(pdu->ubq->active_fcmd != fcmd); + + ublk_batch_dispatch(pdu->ubq, &data, fcmd); +} + +static void +ublk_batch_dispatch(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + struct ublk_batch_fetch_cmd *fcmd) +{ + struct ublk_batch_fetch_cmd *new_fcmd; + unsigned tried = 0; + int ret = 0; + +again: + while (!ublk_io_evts_empty(ubq)) { + ret = __ublk_batch_dispatch(ubq, data, fcmd); + if (ret <= 0) + break; + } + + if (ret < 0) { + ublk_batch_deinit_fetch_buf(ubq, data, fcmd, ret); + return; + } + + __ublk_release_fcmd(ubq); + /* + * Order clearing ubq->active_fcmd from __ublk_release_fcmd() and + * checking ubq->evts_fifo. + * + * The pair is the smp_mb() in __ublk_acquire_fcmd(). + */ + smp_mb(); + if (likely(ublk_io_evts_empty(ubq))) + return; + + spin_lock(&ubq->evts_lock); + new_fcmd = __ublk_acquire_fcmd(ubq); + spin_unlock(&ubq->evts_lock); + + if (!new_fcmd) + return; + + /* Avoid lockup by allowing to handle at most 32 batches */ + if (new_fcmd == fcmd && tried++ < 32) + goto again; + + io_uring_cmd_complete_in_task(new_fcmd->cmd, ublk_batch_tw_cb); +} + static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw) { struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); @@ -1464,6 +1877,21 @@ static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw) ublk_dispatch_req(ubq, pdu->req); } +static void ublk_batch_queue_cmd(struct ublk_queue *ubq, struct request *rq, bool last) +{ + unsigned short tag = rq->tag; + struct ublk_batch_fetch_cmd *fcmd = NULL; + + spin_lock(&ubq->evts_lock); + kfifo_put(&ubq->evts_fifo, tag); + if (last) + fcmd = __ublk_acquire_fcmd(ubq); + spin_unlock(&ubq->evts_lock); + + if (fcmd) + io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb); +} + static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) { struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; @@ -1553,16 +1981,22 @@ static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq, return BLK_STS_OK; } -static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *bd) +/* + * Common helper for queue_rq that handles request preparation and + * cancellation checks. Returns status and sets should_queue to indicate + * whether the caller should proceed with queuing the request. 
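+ *
+ * Shared by ublk_queue_rq() (per-I/O mode) and ublk_batch_queue_rq()
+ * (UBLK_F_BATCH_IO mode).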
+ */ +static inline blk_status_t __ublk_queue_rq_common(struct ublk_queue *ubq, + struct request *rq, + bool *should_queue) { - struct ublk_queue *ubq = hctx->driver_data; - struct request *rq = bd->rq; blk_status_t res; res = ublk_prep_req(ubq, rq, false); - if (res != BLK_STS_OK) + if (res != BLK_STS_OK) { + *should_queue = false; return res; + } /* * ->canceling has to be handled after ->force_abort and ->fail_io @@ -1570,14 +2004,47 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, * of recovery, and cause hang when deleting disk */ if (unlikely(ubq->canceling)) { + *should_queue = false; __ublk_abort_rq(ubq, rq); return BLK_STS_OK; } + *should_queue = true; + return BLK_STS_OK; +} + +static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct ublk_queue *ubq = hctx->driver_data; + struct request *rq = bd->rq; + bool should_queue; + blk_status_t res; + + res = __ublk_queue_rq_common(ubq, rq, &should_queue); + if (!should_queue) + return res; + ublk_queue_cmd(ubq, rq); return BLK_STS_OK; } +static blk_status_t ublk_batch_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct ublk_queue *ubq = hctx->driver_data; + struct request *rq = bd->rq; + bool should_queue; + blk_status_t res; + + res = __ublk_queue_rq_common(ubq, rq, &should_queue); + if (!should_queue) + return res; + + ublk_batch_queue_cmd(ubq, rq, bd->last); + return BLK_STS_OK; +} + static inline bool ublk_belong_to_same_batch(const struct ublk_io *io, const struct ublk_io *io2) { @@ -1586,6 +2053,19 @@ static inline bool ublk_belong_to_same_batch(const struct ublk_io *io, (io->task == io2->task); } +static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx) +{ + struct ublk_queue *ubq = hctx->driver_data; + struct ublk_batch_fetch_cmd *fcmd; + + spin_lock(&ubq->evts_lock); + fcmd = __ublk_acquire_fcmd(ubq); + spin_unlock(&ubq->evts_lock); + + if (fcmd) + io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb); +} + static void ublk_queue_rqs(struct rq_list *rqlist) { struct rq_list requeue_list = { }; @@ -1614,6 +2094,57 @@ static void ublk_queue_rqs(struct rq_list *rqlist) *rqlist = requeue_list; } +static void ublk_batch_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l) +{ + unsigned short tags[MAX_NR_TAG]; + struct ublk_batch_fetch_cmd *fcmd; + struct request *rq; + unsigned cnt = 0; + + spin_lock(&ubq->evts_lock); + rq_list_for_each(l, rq) { + tags[cnt++] = (unsigned short)rq->tag; + if (cnt >= MAX_NR_TAG) { + kfifo_in(&ubq->evts_fifo, tags, cnt); + cnt = 0; + } + } + if (cnt) + kfifo_in(&ubq->evts_fifo, tags, cnt); + fcmd = __ublk_acquire_fcmd(ubq); + spin_unlock(&ubq->evts_lock); + + rq_list_init(l); + if (fcmd) + io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb); +} + +static void ublk_batch_queue_rqs(struct rq_list *rqlist) +{ + struct rq_list requeue_list = { }; + struct rq_list submit_list = { }; + struct ublk_queue *ubq = NULL; + struct request *req; + + while ((req = rq_list_pop(rqlist))) { + struct ublk_queue *this_q = req->mq_hctx->driver_data; + + if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) { + rq_list_add_tail(&requeue_list, req); + continue; + } + + if (ubq && this_q != ubq && !rq_list_empty(&submit_list)) + ublk_batch_queue_cmd_list(ubq, &submit_list); + ubq = this_q; + rq_list_add_tail(&submit_list, req); + } + + if (!rq_list_empty(&submit_list)) + ublk_batch_queue_cmd_list(ubq, &submit_list); + *rqlist = requeue_list; +} + static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void 
*driver_data, unsigned int hctx_idx) { @@ -1631,10 +2162,20 @@ static const struct blk_mq_ops ublk_mq_ops = { .timeout = ublk_timeout, }; +static const struct blk_mq_ops ublk_batch_mq_ops = { + .commit_rqs = ublk_commit_rqs, + .queue_rq = ublk_batch_queue_rq, + .queue_rqs = ublk_batch_queue_rqs, + .init_hctx = ublk_init_hctx, + .timeout = ublk_timeout, +}; + static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) { int i; + ubq->nr_io_ready = 0; + for (i = 0; i < ubq->q_depth; i++) { struct ublk_io *io = &ubq->ios[i]; @@ -1683,7 +2224,7 @@ static void ublk_reset_ch_dev(struct ublk_device *ub) /* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */ ub->mm = NULL; - ub->nr_io_ready = 0; + ub->nr_queue_ready = 0; ub->unprivileged_daemons = false; ub->ublksrv_tgid = -1; } @@ -1937,13 +2478,32 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma) static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io, struct request *req) { - WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE); + WARN_ON_ONCE(!ublk_dev_support_batch_io(ub) && + io->flags & UBLK_IO_FLAG_ACTIVE); if (ublk_nosrv_should_reissue_outstanding(ub)) blk_mq_requeue_request(req, false); else { io->res = -EIO; - __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub)); + __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL); + } +} + +/* + * Request tag may just be filled to event kfifo, not get chance to + * dispatch, abort these requests too + */ +static void ublk_abort_batch_queue(struct ublk_device *ub, + struct ublk_queue *ubq) +{ + unsigned short tag; + + while (kfifo_out(&ubq->evts_fifo, &tag, 1)) { + struct request *req = blk_mq_tag_to_rq( + ub->tag_set.tags[ubq->q_id], tag); + + if (!WARN_ON_ONCE(!req || !blk_mq_request_started(req))) + __ublk_fail_req(ub, &ubq->ios[tag], req); } } @@ -1965,6 +2525,9 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) __ublk_fail_req(ub, io, io->req); } + + if (ublk_support_batch_io(ubq)) + ublk_abort_batch_queue(ub, ubq); } static void ublk_start_cancel(struct ublk_device *ub) @@ -2028,19 +2591,69 @@ static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag, io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags); } -/* - * The ublk char device won't be closed when calling cancel fn, so both - * ublk device and queue are guaranteed to be live - * - * Two-stage cancel: - * - * - make every active uring_cmd done in ->cancel_fn() - * - * - aborting inflight ublk IO requests in ublk char device release handler, - * which depends on 1st stage because device can only be closed iff all - * uring_cmd are done - * - * Do _not_ try to acquire ub->mutex before all inflight requests are +static void ublk_batch_cancel_cmd(struct ublk_queue *ubq, + struct ublk_batch_fetch_cmd *fcmd, + unsigned int issue_flags) +{ + bool done; + + spin_lock(&ubq->evts_lock); + done = (READ_ONCE(ubq->active_fcmd) != fcmd); + if (done) + list_del(&fcmd->node); + spin_unlock(&ubq->evts_lock); + + if (done) { + io_uring_cmd_done(fcmd->cmd, UBLK_IO_RES_ABORT, issue_flags); + ublk_batch_free_fcmd(fcmd); + } +} + +static void ublk_batch_cancel_queue(struct ublk_queue *ubq) +{ + struct ublk_batch_fetch_cmd *fcmd; + LIST_HEAD(fcmd_list); + + spin_lock(&ubq->evts_lock); + ubq->force_abort = true; + list_splice_init(&ubq->fcmd_head, &fcmd_list); + fcmd = READ_ONCE(ubq->active_fcmd); + if (fcmd) + list_move(&fcmd->node, &ubq->fcmd_head); + spin_unlock(&ubq->evts_lock); + + while 
(!list_empty(&fcmd_list)) { + fcmd = list_first_entry(&fcmd_list, + struct ublk_batch_fetch_cmd, node); + ublk_batch_cancel_cmd(ubq, fcmd, IO_URING_F_UNLOCKED); + } +} + +static void ublk_batch_cancel_fn(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd; + struct ublk_queue *ubq = pdu->ubq; + + ublk_start_cancel(ubq->dev); + + ublk_batch_cancel_cmd(ubq, fcmd, issue_flags); +} + +/* + * The ublk char device won't be closed when calling cancel fn, so both + * ublk device and queue are guaranteed to be live + * + * Two-stage cancel: + * + * - make every active uring_cmd done in ->cancel_fn() + * + * - aborting inflight ublk IO requests in ublk char device release handler, + * which depends on 1st stage because device can only be closed iff all + * uring_cmd are done + * + * Do _not_ try to acquire ub->mutex before all inflight requests are * aborted, otherwise deadlock may be caused. */ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, @@ -2068,17 +2681,25 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, ublk_cancel_cmd(ubq, pdu->tag, issue_flags); } -static inline bool ublk_dev_ready(const struct ublk_device *ub) +static inline bool ublk_queue_ready(const struct ublk_queue *ubq) { - u32 total = (u32)ub->dev_info.nr_hw_queues * ub->dev_info.queue_depth; + return ubq->nr_io_ready == ubq->q_depth; +} - return ub->nr_io_ready == total; +static inline bool ublk_dev_ready(const struct ublk_device *ub) +{ + return ub->nr_queue_ready == ub->dev_info.nr_hw_queues; } static void ublk_cancel_queue(struct ublk_queue *ubq) { int i; + if (ublk_support_batch_io(ubq)) { + ublk_batch_cancel_queue(ubq); + return; + } + for (i = 0; i < ubq->q_depth; i++) ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED); } @@ -2176,37 +2797,52 @@ static void ublk_stop_dev(struct ublk_device *ub) ublk_cancel_dev(ub); } -/* reset ublk io_uring queue & io flags */ -static void ublk_reset_io_flags(struct ublk_device *ub) +/* reset per-queue io flags */ +static void ublk_queue_reset_io_flags(struct ublk_queue *ubq) { - int i, j; - - for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { - struct ublk_queue *ubq = ublk_get_queue(ub, i); + int j; - /* UBLK_IO_FLAG_CANCELED can be cleared now */ - spin_lock(&ubq->cancel_lock); - for (j = 0; j < ubq->q_depth; j++) - ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED; - spin_unlock(&ubq->cancel_lock); - ubq->fail_io = false; - } - mutex_lock(&ub->cancel_mutex); - ublk_set_canceling(ub, false); - mutex_unlock(&ub->cancel_mutex); + /* UBLK_IO_FLAG_CANCELED can be cleared now */ + spin_lock(&ubq->cancel_lock); + for (j = 0; j < ubq->q_depth; j++) + ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED; + spin_unlock(&ubq->cancel_lock); + ubq->fail_io = false; + ubq->canceling = false; } /* device can only be started after all IOs are ready */ -static void ublk_mark_io_ready(struct ublk_device *ub) +static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id) __must_hold(&ub->mutex) { + struct ublk_queue *ubq = ublk_get_queue(ub, q_id); + if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN)) ub->unprivileged_daemons = true; - ub->nr_io_ready++; + ubq->nr_io_ready++; + + /* Check if this specific queue is now fully ready */ + if (ublk_queue_ready(ubq)) { + ub->nr_queue_ready++; + + /* + * Reset queue flags as soon as this queue is ready. + * This clears the canceling flag, allowing batch FETCH commands + * to succeed during recovery without waiting for all queues. 
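+		 * Only the per-queue state (ubq->canceling, ubq->fail_io and the
+		 * per-io CANCELED flags) is reset here; the device-level
+		 * ub->canceling flag is cleared below once every queue has become
+		 * ready.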
+ */ + ublk_queue_reset_io_flags(ubq); + } + + /* Check if all queues are ready */ if (ublk_dev_ready(ub)) { - /* now we are ready for handling ublk io request */ - ublk_reset_io_flags(ub); + /* + * All queues ready - clear device-level canceling flag + * and complete the recovery/initialization. + */ + mutex_lock(&ub->cancel_mutex); + ub->canceling = false; + mutex_unlock(&ub->cancel_mutex); complete_all(&ub->completion); } } @@ -2239,7 +2875,7 @@ static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd return 0; } -static int ublk_handle_auto_buf_reg(struct ublk_io *io, +static void ublk_clear_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd, u16 *buf_idx) { @@ -2259,7 +2895,13 @@ static int ublk_handle_auto_buf_reg(struct ublk_io *io, if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd)) *buf_idx = io->buf.auto_reg.index; } +} +static int ublk_handle_auto_buf_reg(struct ublk_io *io, + struct io_uring_cmd *cmd, + u16 *buf_idx) +{ + ublk_clear_auto_buf_reg(io, cmd, buf_idx); return ublk_set_auto_buf_reg(io, cmd); } @@ -2404,7 +3046,7 @@ static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr) } static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, - struct ublk_io *io) + struct ublk_io *io, u16 q_id) { /* UBLK_IO_FETCH_REQ is only allowed before dev is setup */ if (ublk_dev_ready(ub)) @@ -2418,14 +3060,17 @@ static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, ublk_fill_io_cmd(io, cmd); - WRITE_ONCE(io->task, get_task_struct(current)); - ublk_mark_io_ready(ub); + if (ublk_dev_support_batch_io(ub)) + WRITE_ONCE(io->task, NULL); + else + WRITE_ONCE(io->task, get_task_struct(current)); + ublk_mark_io_ready(ub, q_id); return 0; } static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, - struct ublk_io *io, __u64 buf_addr) + struct ublk_io *io, __u64 buf_addr, u16 q_id) { int ret; @@ -2435,7 +3080,7 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, * FETCH, so it is fine even for IO_URING_F_NONBLOCK. 
*/ mutex_lock(&ub->mutex); - ret = __ublk_fetch(cmd, ub, io); + ret = __ublk_fetch(cmd, ub, io, q_id); if (!ret) ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL); mutex_unlock(&ub->mutex); @@ -2541,7 +3186,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, ret = ublk_check_fetch_buf(ub, addr); if (ret) goto out; - ret = ublk_fetch(cmd, ub, io, addr); + ret = ublk_fetch(cmd, ub, io, addr, q_id); if (ret) goto out; @@ -2594,7 +3239,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, if (req_op(req) == REQ_OP_ZONE_APPEND) req->__sector = addr; if (compl) - __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub)); + __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL); if (ret) goto out; @@ -2679,6 +3324,472 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return ublk_ch_uring_cmd_local(cmd, issue_flags); } +static inline __u64 ublk_batch_buf_addr(const struct ublk_batch_io *uc, + const struct ublk_elem_header *elem) +{ + const void *buf = elem; + + if (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) + return *(const __u64 *)(buf + sizeof(*elem)); + return 0; +} + +static inline __u64 ublk_batch_zone_lba(const struct ublk_batch_io *uc, + const struct ublk_elem_header *elem) +{ + const void *buf = elem; + + if (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) + return *(const __u64 *)(buf + sizeof(*elem) + + 8 * !!(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)); + return -1; +} + +static struct ublk_auto_buf_reg +ublk_batch_auto_buf_reg(const struct ublk_batch_io *uc, + const struct ublk_elem_header *elem) +{ + struct ublk_auto_buf_reg reg = { + .index = elem->buf_index, + .flags = (uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) ? + UBLK_AUTO_BUF_REG_FALLBACK : 0, + }; + + return reg; +} + +/* + * 48 can hold any type of buffer element(8, 16 and 24 bytes) because + * it is the least common multiple(LCM) of 8, 16 and 24 + */ +#define UBLK_CMD_BATCH_TMP_BUF_SZ (48 * 10) +struct ublk_batch_io_iter { + void __user *uaddr; + unsigned done, total; + unsigned char elem_bytes; + /* copy to this buffer from user space */ + unsigned char buf[UBLK_CMD_BATCH_TMP_BUF_SZ]; +}; + +static inline int +__ublk_walk_cmd_buf(struct ublk_queue *ubq, + struct ublk_batch_io_iter *iter, + const struct ublk_batch_io_data *data, + unsigned bytes, + int (*cb)(struct ublk_queue *q, + const struct ublk_batch_io_data *data, + const struct ublk_elem_header *elem)) +{ + unsigned int i; + int ret = 0; + + for (i = 0; i < bytes; i += iter->elem_bytes) { + const struct ublk_elem_header *elem = + (const struct ublk_elem_header *)&iter->buf[i]; + + if (unlikely(elem->tag >= data->ub->dev_info.queue_depth)) { + ret = -EINVAL; + break; + } + + ret = cb(ubq, data, elem); + if (unlikely(ret)) + break; + } + + iter->done += i; + return ret; +} + +static int ublk_walk_cmd_buf(struct ublk_batch_io_iter *iter, + const struct ublk_batch_io_data *data, + int (*cb)(struct ublk_queue *q, + const struct ublk_batch_io_data *data, + const struct ublk_elem_header *elem)) +{ + struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id); + int ret = 0; + + while (iter->done < iter->total) { + unsigned int len = min(sizeof(iter->buf), iter->total - iter->done); + + if (copy_from_user(iter->buf, iter->uaddr + iter->done, len)) { + pr_warn("ublk%d: read batch cmd buffer failed\n", + data->ub->dev_info.dev_id); + return -EFAULT; + } + + ret = __ublk_walk_cmd_buf(ubq, iter, data, len, cb); + if (ret) + return ret; + } + return 0; +} + +static int ublk_batch_unprep_io(struct ublk_queue *ubq, + const 
struct ublk_batch_io_data *data, + const struct ublk_elem_header *elem) +{ + struct ublk_io *io = &ubq->ios[elem->tag]; + + /* + * If queue was ready before this decrement, it won't be anymore, + * so we need to decrement the queue ready count too. + */ + if (ublk_queue_ready(ubq)) + data->ub->nr_queue_ready--; + ubq->nr_io_ready--; + + ublk_io_lock(io); + io->flags = 0; + ublk_io_unlock(io); + return 0; +} + +static void ublk_batch_revert_prep_cmd(struct ublk_batch_io_iter *iter, + const struct ublk_batch_io_data *data) +{ + int ret; + + /* Re-process only what we've already processed, starting from beginning */ + iter->total = iter->done; + iter->done = 0; + + ret = ublk_walk_cmd_buf(iter, data, ublk_batch_unprep_io); + WARN_ON_ONCE(ret); +} + +static int ublk_batch_prep_io(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + const struct ublk_elem_header *elem) +{ + struct ublk_io *io = &ubq->ios[elem->tag]; + const struct ublk_batch_io *uc = &data->header; + union ublk_io_buf buf = { 0 }; + int ret; + + if (ublk_dev_support_auto_buf_reg(data->ub)) + buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem); + else if (ublk_dev_need_map_io(data->ub)) { + buf.addr = ublk_batch_buf_addr(uc, elem); + + ret = ublk_check_fetch_buf(data->ub, buf.addr); + if (ret) + return ret; + } + + ublk_io_lock(io); + ret = __ublk_fetch(data->cmd, data->ub, io, ubq->q_id); + if (!ret) + io->buf = buf; + ublk_io_unlock(io); + + return ret; +} + +static int ublk_handle_batch_prep_cmd(const struct ublk_batch_io_data *data) +{ + const struct ublk_batch_io *uc = &data->header; + struct io_uring_cmd *cmd = data->cmd; + struct ublk_batch_io_iter iter = { + .uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)), + .total = uc->nr_elem * uc->elem_bytes, + .elem_bytes = uc->elem_bytes, + }; + int ret; + + mutex_lock(&data->ub->mutex); + ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_prep_io); + + if (ret && iter.done) + ublk_batch_revert_prep_cmd(&iter, data); + mutex_unlock(&data->ub->mutex); + return ret; +} + +static int ublk_batch_commit_io_check(const struct ublk_queue *ubq, + struct ublk_io *io, + union ublk_io_buf *buf) +{ + if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) + return -EBUSY; + + /* BATCH_IO doesn't support UBLK_F_NEED_GET_DATA */ + if (ublk_need_map_io(ubq) && !buf->addr) + return -EINVAL; + return 0; +} + +static int ublk_batch_commit_io(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + const struct ublk_elem_header *elem) +{ + struct ublk_io *io = &ubq->ios[elem->tag]; + const struct ublk_batch_io *uc = &data->header; + u16 buf_idx = UBLK_INVALID_BUF_IDX; + union ublk_io_buf buf = { 0 }; + struct request *req = NULL; + bool auto_reg = false; + bool compl = false; + int ret; + + if (ublk_dev_support_auto_buf_reg(data->ub)) { + buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem); + auto_reg = true; + } else if (ublk_dev_need_map_io(data->ub)) + buf.addr = ublk_batch_buf_addr(uc, elem); + + ublk_io_lock(io); + ret = ublk_batch_commit_io_check(ubq, io, &buf); + if (!ret) { + io->res = elem->result; + io->buf = buf; + req = ublk_fill_io_cmd(io, data->cmd); + + if (auto_reg) + ublk_clear_auto_buf_reg(io, data->cmd, &buf_idx); + compl = ublk_need_complete_req(data->ub, io); + } + ublk_io_unlock(io); + + if (unlikely(ret)) { + pr_warn_ratelimited("%s: dev %u queue %u io %u: commit failure %d\n", + __func__, data->ub->dev_info.dev_id, ubq->q_id, + elem->tag, ret); + return ret; + } + + /* can't touch 'ublk_io' any more */ + if (buf_idx != UBLK_INVALID_BUF_IDX) + 
io_buffer_unregister_bvec(data->cmd, buf_idx, data->issue_flags); + if (req_op(req) == REQ_OP_ZONE_APPEND) + req->__sector = ublk_batch_zone_lba(uc, elem); + if (compl) + __ublk_complete_rq(req, io, ublk_dev_need_map_io(data->ub), data->iob); + return 0; +} + +static int ublk_handle_batch_commit_cmd(struct ublk_batch_io_data *data) +{ + const struct ublk_batch_io *uc = &data->header; + struct io_uring_cmd *cmd = data->cmd; + struct ublk_batch_io_iter iter = { + .uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)), + .total = uc->nr_elem * uc->elem_bytes, + .elem_bytes = uc->elem_bytes, + }; + DEFINE_IO_COMP_BATCH(iob); + int ret; + + data->iob = &iob; + ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_commit_io); + + if (iob.complete) + iob.complete(&iob); + + return iter.done == 0 ? ret : iter.done; +} + +static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc) +{ + unsigned elem_bytes = sizeof(struct ublk_elem_header); + + if (uc->flags & ~UBLK_BATCH_F_ALL) + return -EINVAL; + + /* UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK requires buffer index */ + if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) && + (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)) + return -EINVAL; + + elem_bytes += (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA ? sizeof(u64) : 0) + + (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR ? sizeof(u64) : 0); + if (uc->elem_bytes != elem_bytes) + return -EINVAL; + return 0; +} + +static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data) +{ + const struct ublk_batch_io *uc = &data->header; + + if (uc->q_id >= data->ub->dev_info.nr_hw_queues) + return -EINVAL; + + if (uc->nr_elem > data->ub->dev_info.queue_depth) + return -E2BIG; + + if ((uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) && + !ublk_dev_is_zoned(data->ub)) + return -EINVAL; + + if ((uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) && + !ublk_dev_need_map_io(data->ub)) + return -EINVAL; + + if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) && + !ublk_dev_support_auto_buf_reg(data->ub)) + return -EINVAL; + + return ublk_check_batch_cmd_flags(uc); +} + +static int ublk_batch_attach(struct ublk_queue *ubq, + struct ublk_batch_io_data *data, + struct ublk_batch_fetch_cmd *fcmd) +{ + struct ublk_batch_fetch_cmd *new_fcmd = NULL; + bool free = false; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(data->cmd); + + spin_lock(&ubq->evts_lock); + if (unlikely(ubq->force_abort || ubq->canceling)) { + free = true; + } else { + list_add_tail(&fcmd->node, &ubq->fcmd_head); + new_fcmd = __ublk_acquire_fcmd(ubq); + } + spin_unlock(&ubq->evts_lock); + + if (unlikely(free)) { + ublk_batch_free_fcmd(fcmd); + return -ENODEV; + } + + pdu->ubq = ubq; + pdu->fcmd = fcmd; + io_uring_cmd_mark_cancelable(fcmd->cmd, data->issue_flags); + + if (!new_fcmd) + goto out; + + /* + * If the two fetch commands are originated from same io_ring_ctx, + * run batch dispatch directly. Otherwise, schedule task work for + * doing it. 
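+	 * The direct call relies on data->issue_flags, which are only valid for
+	 * commands submitted from the same io_ring_ctx; a fetch command owned
+	 * by a different ring is instead run via task work in its own context.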
+ */ + if (io_uring_cmd_ctx_handle(new_fcmd->cmd) == + io_uring_cmd_ctx_handle(fcmd->cmd)) { + data->cmd = new_fcmd->cmd; + ublk_batch_dispatch(ubq, data, new_fcmd); + } else { + io_uring_cmd_complete_in_task(new_fcmd->cmd, + ublk_batch_tw_cb); + } +out: + return -EIOCBQUEUED; +} + +static int ublk_handle_batch_fetch_cmd(struct ublk_batch_io_data *data) +{ + struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id); + struct ublk_batch_fetch_cmd *fcmd = ublk_batch_alloc_fcmd(data->cmd); + + if (!fcmd) + return -ENOMEM; + + return ublk_batch_attach(ubq, data, fcmd); +} + +static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data) +{ + const struct ublk_batch_io *uc = &data->header; + + if (uc->q_id >= data->ub->dev_info.nr_hw_queues) + return -EINVAL; + + if (!(data->cmd->flags & IORING_URING_CMD_MULTISHOT)) + return -EINVAL; + + if (uc->elem_bytes != sizeof(__u16)) + return -EINVAL; + + if (uc->flags != 0) + return -EINVAL; + + return 0; +} + +static int ublk_handle_non_batch_cmd(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + const struct ublksrv_io_cmd *ub_cmd = io_uring_sqe_cmd(cmd->sqe); + struct ublk_device *ub = cmd->file->private_data; + unsigned tag = READ_ONCE(ub_cmd->tag); + unsigned q_id = READ_ONCE(ub_cmd->q_id); + unsigned index = READ_ONCE(ub_cmd->addr); + struct ublk_queue *ubq; + struct ublk_io *io; + + if (cmd->cmd_op == UBLK_U_IO_UNREGISTER_IO_BUF) + return ublk_unregister_io_buf(cmd, ub, index, issue_flags); + + if (q_id >= ub->dev_info.nr_hw_queues) + return -EINVAL; + + if (tag >= ub->dev_info.queue_depth) + return -EINVAL; + + if (cmd->cmd_op != UBLK_U_IO_REGISTER_IO_BUF) + return -EOPNOTSUPP; + + ubq = ublk_get_queue(ub, q_id); + io = &ubq->ios[tag]; + return ublk_register_io_buf(cmd, ub, q_id, tag, io, index, + issue_flags); +} + +static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + const struct ublk_batch_io *uc = io_uring_sqe_cmd(cmd->sqe); + struct ublk_device *ub = cmd->file->private_data; + struct ublk_batch_io_data data = { + .ub = ub, + .cmd = cmd, + .header = (struct ublk_batch_io) { + .q_id = READ_ONCE(uc->q_id), + .flags = READ_ONCE(uc->flags), + .nr_elem = READ_ONCE(uc->nr_elem), + .elem_bytes = READ_ONCE(uc->elem_bytes), + }, + .issue_flags = issue_flags, + }; + u32 cmd_op = cmd->cmd_op; + int ret = -EINVAL; + + if (unlikely(issue_flags & IO_URING_F_CANCEL)) { + ublk_batch_cancel_fn(cmd, issue_flags); + return 0; + } + + switch (cmd_op) { + case UBLK_U_IO_PREP_IO_CMDS: + ret = ublk_check_batch_cmd(&data); + if (ret) + goto out; + ret = ublk_handle_batch_prep_cmd(&data); + break; + case UBLK_U_IO_COMMIT_IO_CMDS: + ret = ublk_check_batch_cmd(&data); + if (ret) + goto out; + ret = ublk_handle_batch_commit_cmd(&data); + break; + case UBLK_U_IO_FETCH_IO_CMDS: + ret = ublk_validate_batch_fetch_cmd(&data); + if (ret) + goto out; + ret = ublk_handle_batch_fetch_cmd(&data); + break; + default: + ret = ublk_handle_non_batch_cmd(cmd, issue_flags); + break; + } +out: + return ret; +} + static inline bool ublk_check_ubuf_dir(const struct request *req, int ubuf_dir) { @@ -2798,14 +3909,20 @@ static const struct file_operations ublk_ch_fops = { .mmap = ublk_ch_mmap, }; -static void ublk_deinit_queue(struct ublk_device *ub, int q_id) +static const struct file_operations ublk_ch_batch_io_fops = { + .owner = THIS_MODULE, + .open = ublk_ch_open, + .release = ublk_ch_release, + .read_iter = ublk_ch_read_iter, + .write_iter = ublk_ch_write_iter, + .uring_cmd = ublk_ch_batch_io_uring_cmd, + .mmap 
= ublk_ch_mmap, +}; + +static void __ublk_deinit_queue(struct ublk_device *ub, struct ublk_queue *ubq) { - struct ublk_queue *ubq = ub->queues[q_id]; int size, i; - if (!ubq) - return; - size = ublk_queue_cmd_buf_size(ub); for (i = 0; i < ubq->q_depth; i++) { @@ -2819,7 +3936,20 @@ static void ublk_deinit_queue(struct ublk_device *ub, int q_id) if (ubq->io_cmd_buf) free_pages((unsigned long)ubq->io_cmd_buf, get_order(size)); + if (ublk_dev_support_batch_io(ub)) + ublk_io_evts_deinit(ubq); + kvfree(ubq); +} + +static void ublk_deinit_queue(struct ublk_device *ub, int q_id) +{ + struct ublk_queue *ubq = ub->queues[q_id]; + + if (!ubq) + return; + + __ublk_deinit_queue(ub, ubq); ub->queues[q_id] = NULL; } @@ -2843,7 +3973,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) struct ublk_queue *ubq; struct page *page; int numa_node; - int size; + int size, i, ret; /* Determine NUMA node based on queue's CPU affinity */ numa_node = ublk_get_queue_numa_node(ub, q_id); @@ -2868,9 +3998,22 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) } ubq->io_cmd_buf = page_address(page); + for (i = 0; i < ubq->q_depth; i++) + spin_lock_init(&ubq->ios[i].lock); + + if (ublk_dev_support_batch_io(ub)) { + ret = ublk_io_evts_init(ubq, ubq->q_depth, numa_node); + if (ret) + goto fail; + INIT_LIST_HEAD(&ubq->fcmd_head); + } ub->queues[q_id] = ubq; ubq->dev = ub; + return 0; +fail: + __ublk_deinit_queue(ub, ubq); + return ret; } static void ublk_deinit_queues(struct ublk_device *ub) @@ -2958,7 +4101,10 @@ static int ublk_add_chdev(struct ublk_device *ub) if (ret) goto fail; - cdev_init(&ub->cdev, &ublk_ch_fops); + if (ublk_dev_support_batch_io(ub)) + cdev_init(&ub->cdev, &ublk_ch_batch_io_fops); + else + cdev_init(&ub->cdev, &ublk_ch_fops); ret = cdev_device_add(&ub->cdev, dev); if (ret) goto fail; @@ -2982,7 +4128,10 @@ static void ublk_align_max_io_size(struct ublk_device *ub) static int ublk_add_tag_set(struct ublk_device *ub) { - ub->tag_set.ops = &ublk_mq_ops; + if (ublk_dev_support_batch_io(ub)) + ub->tag_set.ops = &ublk_batch_mq_ops; + else + ub->tag_set.ops = &ublk_mq_ops; ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues; ub->tag_set.queue_depth = ub->dev_info.queue_depth; ub->tag_set.numa_node = NUMA_NO_NODE; @@ -3108,6 +4257,11 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, return -EINVAL; mutex_lock(&ub->mutex); + /* device may become not ready in case of F_BATCH */ + if (!ublk_dev_ready(ub)) { + ret = -EINVAL; + goto out_unlock; + } if (ub->dev_info.state == UBLK_S_DEV_LIVE || test_bit(UB_STATE_USED, &ub->state)) { ret = -EEXIST; @@ -3353,11 +4507,19 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) UBLK_F_BUF_REG_OFF_DAEMON | UBLK_F_SAFE_STOP_DEV; + /* So far, UBLK_F_PER_IO_DAEMON won't be exposed for BATCH_IO */ + if (ublk_dev_support_batch_io(ub)) + ub->dev_info.flags &= ~UBLK_F_PER_IO_DAEMON; + /* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */ if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; + /* UBLK_F_BATCH_IO doesn't support GET_DATA */ + if (ublk_dev_support_batch_io(ub)) + ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; + /* * Zoned storage support requires reuse `ublksrv_io_cmd->addr` for * returning write_append_lba, which is only allowed in case of @@ -3732,6 +4894,13 @@ static int ublk_wait_for_idle_io(struct ublk_device *ub, unsigned int elapsed = 0; int ret; + /* + * For UBLK_F_BATCH_IO ublk server can get notified with 
existing + * or new fetch command, so needn't wait any more + */ + if (ublk_dev_support_batch_io(ub)) + return 0; + while (elapsed < timeout_ms && !signal_pending(current)) { unsigned int queues_cancelable = 0; int i; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 90f47da4f435..743d31491387 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -104,6 +104,30 @@ #define UBLK_U_IO_UNREGISTER_IO_BUF \ _IOWR('u', 0x24, struct ublksrv_io_cmd) +/* + * return 0 if the command is run successfully, otherwise failure code + * is returned + */ +#define UBLK_U_IO_PREP_IO_CMDS \ + _IOWR('u', 0x25, struct ublk_batch_io) +/* + * If failure code is returned, nothing in the command buffer is handled. + * Otherwise, the returned value means how many bytes in command buffer + * are handled actually, then number of handled IOs can be calculated with + * `elem_bytes` for each IO. IOs in the remained bytes are not committed, + * userspace has to check return value for dealing with partial committing + * correctly. + */ +#define UBLK_U_IO_COMMIT_IO_CMDS \ + _IOWR('u', 0x26, struct ublk_batch_io) + +/* + * Fetch io commands to provided buffer in multishot style, + * `IORING_URING_CMD_MULTISHOT` is required for this command. + */ +#define UBLK_U_IO_FETCH_IO_CMDS \ + _IOWR('u', 0x27, struct ublk_batch_io) + /* only ABORT means that no re-fetch */ #define UBLK_IO_RES_OK 0 #define UBLK_IO_RES_NEED_GET_DATA 1 @@ -316,6 +340,21 @@ */ #define UBLK_F_BUF_REG_OFF_DAEMON (1ULL << 14) +/* + * Support the following commands for delivering & committing io command + * in batch. + * + * - UBLK_U_IO_PREP_IO_CMDS + * - UBLK_U_IO_COMMIT_IO_CMDS + * - UBLK_U_IO_FETCH_IO_CMDS + * - UBLK_U_IO_REGISTER_IO_BUF + * - UBLK_U_IO_UNREGISTER_IO_BUF + * + * The existing UBLK_U_IO_FETCH_REQ, UBLK_U_IO_COMMIT_AND_FETCH_REQ and + * UBLK_U_IO_NEED_GET_DATA uring_cmd are not supported for this feature. + */ +#define UBLK_F_BATCH_IO (1ULL << 15) + /* * ublk device supports requests with integrity/metadata buffer. * Requires UBLK_F_USER_COPY. @@ -544,6 +583,51 @@ struct ublksrv_io_cmd { }; }; +struct ublk_elem_header { + __u16 tag; /* IO tag */ + + /* + * Buffer index for incoming io command, only valid iff + * UBLK_F_AUTO_BUF_REG is set + */ + __u16 buf_index; + __s32 result; /* I/O completion result (commit only) */ +}; + +/* + * uring_cmd buffer structure for batch commands + * + * buffer includes multiple elements, which number is specified by + * `nr_elem`. 
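+ * The total buffer length is `nr_elem * elem_bytes` bytes.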
Each element buffer is organized in the following order: + * + * struct ublk_elem_buffer { + * // Mandatory fields (8 bytes) + * struct ublk_elem_header header; + * + * // Optional fields (8 bytes each, included based on flags) + * + * // Buffer address (if UBLK_BATCH_F_HAS_BUF_ADDR) for copying data + * // between ublk request and ublk server buffer + * __u64 buf_addr; + * + * // returned Zone append LBA (if UBLK_BATCH_F_HAS_ZONE_LBA) + * __u64 zone_lba; + * } + * + * Used for `UBLK_U_IO_PREP_IO_CMDS` and `UBLK_U_IO_COMMIT_IO_CMDS` + */ +struct ublk_batch_io { + __u16 q_id; +#define UBLK_BATCH_F_HAS_ZONE_LBA (1 << 0) +#define UBLK_BATCH_F_HAS_BUF_ADDR (1 << 1) +#define UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK (1 << 2) + __u16 flags; + __u16 nr_elem; + __u8 elem_bytes; + __u8 reserved; + __u64 reserved2; +}; + struct ublk_param_basic { #define UBLK_ATTR_READ_ONLY (1 << 0) #define UBLK_ATTR_ROTATIONAL (1 << 1) diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 3a2498089b15..e39a6f871fcc 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -25,6 +25,10 @@ TEST_PROGS += test_generic_14.sh TEST_PROGS += test_generic_15.sh TEST_PROGS += test_generic_16.sh +TEST_PROGS += test_batch_01.sh +TEST_PROGS += test_batch_02.sh +TEST_PROGS += test_batch_03.sh + TEST_PROGS += test_null_01.sh TEST_PROGS += test_null_02.sh TEST_PROGS += test_null_03.sh @@ -51,6 +55,10 @@ TEST_PROGS += test_stress_04.sh TEST_PROGS += test_stress_05.sh TEST_PROGS += test_stress_06.sh TEST_PROGS += test_stress_07.sh +TEST_PROGS += test_stress_08.sh +TEST_PROGS += test_stress_09.sh + +TEST_FILES := settings TEST_GEN_PROGS_EXTENDED = kublk metadata_size STANDALONE_UTILS := metadata_size.c diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c new file mode 100644 index 000000000000..a54025b00917 --- /dev/null +++ b/tools/testing/selftests/ublk/batch.c @@ -0,0 +1,607 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Description: UBLK_F_BATCH_IO buffer management + */ + +#include "kublk.h" + +static inline void *ublk_get_commit_buf(struct ublk_thread *t, + unsigned short buf_idx) +{ + unsigned idx; + + if (buf_idx < t->commit_buf_start || + buf_idx >= t->commit_buf_start + t->nr_commit_buf) + return NULL; + idx = buf_idx - t->commit_buf_start; + return t->commit_buf + idx * t->commit_buf_size; +} + +/* + * Allocate one buffer for UBLK_U_IO_PREP_IO_CMDS or UBLK_U_IO_COMMIT_IO_CMDS + * + * Buffer index is returned. 
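+ * UBLKS_T_COMMIT_BUF_INV_IDX is returned when no commit buffer is
+ * available.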
+ */ +static inline unsigned short ublk_alloc_commit_buf(struct ublk_thread *t) +{ + int idx = allocator_get(&t->commit_buf_alloc); + + if (idx >= 0) + return idx + t->commit_buf_start; + return UBLKS_T_COMMIT_BUF_INV_IDX; +} + +/* + * Free one commit buffer which is used by UBLK_U_IO_PREP_IO_CMDS or + * UBLK_U_IO_COMMIT_IO_CMDS + */ +static inline void ublk_free_commit_buf(struct ublk_thread *t, + unsigned short i) +{ + unsigned short idx = i - t->commit_buf_start; + + ublk_assert(idx < t->nr_commit_buf); + ublk_assert(allocator_get_val(&t->commit_buf_alloc, idx) != 0); + + allocator_put(&t->commit_buf_alloc, idx); +} + +static unsigned char ublk_commit_elem_buf_size(struct ublk_dev *dev) +{ + if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_USER_COPY | + UBLK_F_AUTO_BUF_REG)) + return 8; + + /* one extra 8bytes for carrying buffer address */ + return 16; +} + +static unsigned ublk_commit_buf_size(struct ublk_thread *t) +{ + struct ublk_dev *dev = t->dev; + unsigned elem_size = ublk_commit_elem_buf_size(dev); + unsigned int total = elem_size * dev->dev_info.queue_depth; + unsigned int page_sz = getpagesize(); + + return round_up(total, page_sz); +} + +static void free_batch_commit_buf(struct ublk_thread *t) +{ + if (t->commit_buf) { + unsigned buf_size = ublk_commit_buf_size(t); + unsigned int total = buf_size * t->nr_commit_buf; + + munlock(t->commit_buf, total); + free(t->commit_buf); + } + allocator_deinit(&t->commit_buf_alloc); + free(t->commit); +} + +static int alloc_batch_commit_buf(struct ublk_thread *t) +{ + unsigned buf_size = ublk_commit_buf_size(t); + unsigned int total = buf_size * t->nr_commit_buf; + unsigned int page_sz = getpagesize(); + void *buf = NULL; + int i, ret, j = 0; + + t->commit = calloc(t->nr_queues, sizeof(*t->commit)); + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) { + if (t->q_map[i]) + t->commit[j++].q_id = i; + } + + allocator_init(&t->commit_buf_alloc, t->nr_commit_buf); + + t->commit_buf = NULL; + ret = posix_memalign(&buf, page_sz, total); + if (ret || !buf) + goto fail; + + t->commit_buf = buf; + + /* lock commit buffer pages for fast access */ + if (mlock(t->commit_buf, total)) + ublk_err("%s: can't lock commit buffer %s\n", __func__, + strerror(errno)); + + return 0; + +fail: + free_batch_commit_buf(t); + return ret; +} + +static unsigned int ublk_thread_nr_queues(const struct ublk_thread *t) +{ + int i; + int ret = 0; + + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) + ret += !!t->q_map[i]; + + return ret; +} + +void ublk_batch_prepare(struct ublk_thread *t) +{ + /* + * We only handle single device in this thread context. + * + * All queues have same feature flags, so use queue 0's for + * calculate uring_cmd flags. + * + * This way looks not elegant, but it works so far. 
+ */ + struct ublk_queue *q = &t->dev->q[0]; + + /* cache nr_queues because we don't support dynamic load-balance yet */ + t->nr_queues = ublk_thread_nr_queues(t); + + t->commit_buf_elem_size = ublk_commit_elem_buf_size(t->dev); + t->commit_buf_size = ublk_commit_buf_size(t); + t->commit_buf_start = t->nr_bufs; + t->nr_commit_buf = 2 * t->nr_queues; + t->nr_bufs += t->nr_commit_buf; + + t->cmd_flags = 0; + if (ublk_queue_use_auto_zc(q)) { + if (ublk_queue_auto_zc_fallback(q)) + t->cmd_flags |= UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK; + } else if (!ublk_queue_no_buf(q)) + t->cmd_flags |= UBLK_BATCH_F_HAS_BUF_ADDR; + + t->state |= UBLKS_T_BATCH_IO; + + ublk_log("%s: thread %d commit(nr_bufs %u, buf_size %u, start %u)\n", + __func__, t->idx, + t->nr_commit_buf, t->commit_buf_size, + t->nr_bufs); +} + +static void free_batch_fetch_buf(struct ublk_thread *t) +{ + int i; + + for (i = 0; i < t->nr_fetch_bufs; i++) { + io_uring_free_buf_ring(&t->ring, t->fetch[i].br, 1, i); + munlock(t->fetch[i].fetch_buf, t->fetch[i].fetch_buf_size); + free(t->fetch[i].fetch_buf); + } + free(t->fetch); +} + +static int alloc_batch_fetch_buf(struct ublk_thread *t) +{ + /* page aligned fetch buffer, and it is mlocked for speedup delivery */ + unsigned pg_sz = getpagesize(); + unsigned buf_size = round_up(t->dev->dev_info.queue_depth * 2, pg_sz); + int ret; + int i = 0; + + /* double fetch buffer for each queue */ + t->nr_fetch_bufs = t->nr_queues * 2; + t->fetch = calloc(t->nr_fetch_bufs, sizeof(*t->fetch)); + + /* allocate one buffer for each queue */ + for (i = 0; i < t->nr_fetch_bufs; i++) { + t->fetch[i].fetch_buf_size = buf_size; + + if (posix_memalign((void **)&t->fetch[i].fetch_buf, pg_sz, + t->fetch[i].fetch_buf_size)) + return -ENOMEM; + + /* lock fetch buffer page for fast fetching */ + if (mlock(t->fetch[i].fetch_buf, t->fetch[i].fetch_buf_size)) + ublk_err("%s: can't lock fetch buffer %s\n", __func__, + strerror(errno)); + t->fetch[i].br = io_uring_setup_buf_ring(&t->ring, 1, + i, IOU_PBUF_RING_INC, &ret); + if (!t->fetch[i].br) { + ublk_err("Buffer ring register failed %d\n", ret); + return ret; + } + } + + return 0; +} + +int ublk_batch_alloc_buf(struct ublk_thread *t) +{ + int ret; + + ublk_assert(t->nr_commit_buf < 2 * UBLK_MAX_QUEUES); + + ret = alloc_batch_commit_buf(t); + if (ret) + return ret; + return alloc_batch_fetch_buf(t); +} + +void ublk_batch_free_buf(struct ublk_thread *t) +{ + free_batch_commit_buf(t); + free_batch_fetch_buf(t); +} + +static void ublk_init_batch_cmd(struct ublk_thread *t, __u16 q_id, + struct io_uring_sqe *sqe, unsigned op, + unsigned short elem_bytes, + unsigned short nr_elem, + unsigned short buf_idx) +{ + struct ublk_batch_io *cmd; + __u64 user_data; + + cmd = (struct ublk_batch_io *)ublk_get_sqe_cmd(sqe); + + ublk_set_sqe_cmd_op(sqe, op); + + sqe->fd = 0; /* dev->fds[0] */ + sqe->opcode = IORING_OP_URING_CMD; + sqe->flags = IOSQE_FIXED_FILE; + + cmd->q_id = q_id; + cmd->flags = 0; + cmd->reserved = 0; + cmd->elem_bytes = elem_bytes; + cmd->nr_elem = nr_elem; + + user_data = build_user_data(buf_idx, _IOC_NR(op), nr_elem, q_id, 0); + io_uring_sqe_set_data64(sqe, user_data); + + t->cmd_inflight += 1; + + ublk_dbg(UBLK_DBG_IO_CMD, "%s: thread %u qid %d cmd_op %x data %lx " + "nr_elem %u elem_bytes %u buf_size %u buf_idx %d " + "cmd_inflight %u\n", + __func__, t->idx, q_id, op, user_data, + cmd->nr_elem, cmd->elem_bytes, + nr_elem * elem_bytes, buf_idx, t->cmd_inflight); +} + +static void ublk_setup_commit_sqe(struct ublk_thread *t, + struct io_uring_sqe *sqe, + unsigned 
short buf_idx) +{ + struct ublk_batch_io *cmd; + + cmd = (struct ublk_batch_io *)ublk_get_sqe_cmd(sqe); + + /* Use plain user buffer instead of fixed buffer */ + cmd->flags |= t->cmd_flags; +} + +static void ublk_batch_queue_fetch(struct ublk_thread *t, + struct ublk_queue *q, + unsigned short buf_idx) +{ + unsigned short nr_elem = t->fetch[buf_idx].fetch_buf_size / 2; + struct io_uring_sqe *sqe; + + io_uring_buf_ring_add(t->fetch[buf_idx].br, t->fetch[buf_idx].fetch_buf, + t->fetch[buf_idx].fetch_buf_size, + 0, 0, 0); + io_uring_buf_ring_advance(t->fetch[buf_idx].br, 1); + + ublk_io_alloc_sqes(t, &sqe, 1); + + ublk_init_batch_cmd(t, q->q_id, sqe, UBLK_U_IO_FETCH_IO_CMDS, 2, nr_elem, + buf_idx); + + sqe->rw_flags= IORING_URING_CMD_MULTISHOT; + sqe->buf_group = buf_idx; + sqe->flags |= IOSQE_BUFFER_SELECT; + + t->fetch[buf_idx].fetch_buf_off = 0; +} + +void ublk_batch_start_fetch(struct ublk_thread *t) +{ + int i; + int j = 0; + + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) { + if (t->q_map[i]) { + struct ublk_queue *q = &t->dev->q[i]; + + /* submit two fetch commands for each queue */ + ublk_batch_queue_fetch(t, q, j++); + ublk_batch_queue_fetch(t, q, j++); + } + } +} + +static unsigned short ublk_compl_batch_fetch(struct ublk_thread *t, + struct ublk_queue *q, + const struct io_uring_cqe *cqe) +{ + unsigned short buf_idx = user_data_to_tag(cqe->user_data); + unsigned start = t->fetch[buf_idx].fetch_buf_off; + unsigned end = start + cqe->res; + void *buf = t->fetch[buf_idx].fetch_buf; + int i; + + if (cqe->res < 0) + return buf_idx; + + if ((end - start) / 2 > q->q_depth) { + ublk_err("%s: fetch duplicated ios offset %u count %u\n", __func__, start, cqe->res); + + for (i = start; i < end; i += 2) { + unsigned short tag = *(unsigned short *)(buf + i); + + ublk_err("%u ", tag); + } + ublk_err("\n"); + } + + for (i = start; i < end; i += 2) { + unsigned short tag = *(unsigned short *)(buf + i); + + if (tag >= q->q_depth) + ublk_err("%s: bad tag %u\n", __func__, tag); + + if (q->tgt_ops->queue_io) + q->tgt_ops->queue_io(t, q, tag); + } + t->fetch[buf_idx].fetch_buf_off = end; + return buf_idx; +} + +static int __ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) +{ + unsigned short nr_elem = q->q_depth; + unsigned short buf_idx = ublk_alloc_commit_buf(t); + struct io_uring_sqe *sqe; + void *buf; + int i; + + ublk_assert(buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX); + + ublk_io_alloc_sqes(t, &sqe, 1); + + ublk_assert(nr_elem == q->q_depth); + buf = ublk_get_commit_buf(t, buf_idx); + for (i = 0; i < nr_elem; i++) { + struct ublk_batch_elem *elem = (struct ublk_batch_elem *)( + buf + i * t->commit_buf_elem_size); + struct ublk_io *io = &q->ios[i]; + + elem->tag = i; + elem->result = 0; + + if (ublk_queue_use_auto_zc(q)) + elem->buf_index = ublk_batch_io_buf_idx(t, q, i); + else if (!ublk_queue_no_buf(q)) + elem->buf_addr = (__u64)io->buf_addr; + } + + sqe->addr = (__u64)buf; + sqe->len = t->commit_buf_elem_size * nr_elem; + + ublk_init_batch_cmd(t, q->q_id, sqe, UBLK_U_IO_PREP_IO_CMDS, + t->commit_buf_elem_size, nr_elem, buf_idx); + ublk_setup_commit_sqe(t, sqe, buf_idx); + return 0; +} + +int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) +{ + int ret = 0; + + pthread_spin_lock(&q->lock); + if (q->flags & UBLKS_Q_PREPARED) + goto unlock; + ret = __ublk_batch_queue_prep_io_cmds(t, q); + if (!ret) + q->flags |= UBLKS_Q_PREPARED; +unlock: + pthread_spin_unlock(&q->lock); + + return ret; +} + +static void ublk_batch_compl_commit_cmd(struct 
ublk_thread *t, + const struct io_uring_cqe *cqe, + unsigned op) +{ + unsigned short buf_idx = user_data_to_tag(cqe->user_data); + + if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS)) + ublk_assert(cqe->res == 0); + else if (op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) { + int nr_elem = user_data_to_tgt_data(cqe->user_data); + + ublk_assert(cqe->res == t->commit_buf_elem_size * nr_elem); + } else + ublk_assert(0); + + ublk_free_commit_buf(t, buf_idx); +} + +void ublk_batch_compl_cmd(struct ublk_thread *t, + const struct io_uring_cqe *cqe) +{ + unsigned op = user_data_to_op(cqe->user_data); + struct ublk_queue *q; + unsigned buf_idx; + unsigned q_id; + + if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS) || + op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) { + t->cmd_inflight--; + ublk_batch_compl_commit_cmd(t, cqe, op); + return; + } + + /* FETCH command is per queue */ + q_id = user_data_to_q_id(cqe->user_data); + q = &t->dev->q[q_id]; + buf_idx = ublk_compl_batch_fetch(t, q, cqe); + + if (cqe->res < 0 && cqe->res != -ENOBUFS) { + t->cmd_inflight--; + t->state |= UBLKS_T_STOPPING; + } else if (!(cqe->flags & IORING_CQE_F_MORE) || cqe->res == -ENOBUFS) { + t->cmd_inflight--; + ublk_batch_queue_fetch(t, q, buf_idx); + } +} + +static void __ublk_batch_commit_io_cmds(struct ublk_thread *t, + struct batch_commit_buf *cb) +{ + struct io_uring_sqe *sqe; + unsigned short buf_idx; + unsigned short nr_elem = cb->done; + + /* nothing to commit */ + if (!nr_elem) { + ublk_free_commit_buf(t, cb->buf_idx); + return; + } + + ublk_io_alloc_sqes(t, &sqe, 1); + buf_idx = cb->buf_idx; + sqe->addr = (__u64)cb->elem; + sqe->len = nr_elem * t->commit_buf_elem_size; + + /* commit isn't per-queue command */ + ublk_init_batch_cmd(t, cb->q_id, sqe, UBLK_U_IO_COMMIT_IO_CMDS, + t->commit_buf_elem_size, nr_elem, buf_idx); + ublk_setup_commit_sqe(t, sqe, buf_idx); +} + +void ublk_batch_commit_io_cmds(struct ublk_thread *t) +{ + int i; + + for (i = 0; i < t->nr_queues; i++) { + struct batch_commit_buf *cb = &t->commit[i]; + + if (cb->buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX) + __ublk_batch_commit_io_cmds(t, cb); + } + +} + +static void __ublk_batch_init_commit(struct ublk_thread *t, + struct batch_commit_buf *cb, + unsigned short buf_idx) +{ + /* so far only support 1:1 queue/thread mapping */ + cb->buf_idx = buf_idx; + cb->elem = ublk_get_commit_buf(t, buf_idx); + cb->done = 0; + cb->count = t->commit_buf_size / + t->commit_buf_elem_size; +} + +/* COMMIT_IO_CMDS is per-queue command, so use its own commit buffer */ +static void ublk_batch_init_commit(struct ublk_thread *t, + struct batch_commit_buf *cb) +{ + unsigned short buf_idx = ublk_alloc_commit_buf(t); + + ublk_assert(buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX); + ublk_assert(!ublk_batch_commit_prepared(cb)); + + __ublk_batch_init_commit(t, cb, buf_idx); +} + +void ublk_batch_prep_commit(struct ublk_thread *t) +{ + int i; + + for (i = 0; i < t->nr_queues; i++) + t->commit[i].buf_idx = UBLKS_T_COMMIT_BUF_INV_IDX; +} + +void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int res) +{ + unsigned q_t_idx = ublk_queue_idx_in_thread(t, q); + struct batch_commit_buf *cb = &t->commit[q_t_idx]; + struct ublk_batch_elem *elem; + struct ublk_io *io = &q->ios[tag]; + + if (!ublk_batch_commit_prepared(cb)) + ublk_batch_init_commit(t, cb); + + ublk_assert(q->q_id == cb->q_id); + + elem = (struct ublk_batch_elem *)(cb->elem + cb->done * t->commit_buf_elem_size); + elem->tag = tag; + elem->buf_index = ublk_batch_io_buf_idx(t, q, tag); + elem->result = res; + + if 
(!ublk_queue_no_buf(q)) + elem->buf_addr = (__u64) (uintptr_t) io->buf_addr; + + cb->done += 1; + ublk_assert(cb->done <= cb->count); +} + +void ublk_batch_setup_map(unsigned char (*q_thread_map)[UBLK_MAX_QUEUES], + int nthreads, int queues) +{ + int i, j; + + /* + * Setup round-robin queue-to-thread mapping for arbitrary N:M combinations. + * + * This algorithm distributes queues across threads (and threads across queues) + * in a balanced round-robin fashion to ensure even load distribution. + * + * Examples: + * - 2 threads, 4 queues: T0=[Q0,Q2], T1=[Q1,Q3] + * - 4 threads, 2 queues: T0=[Q0], T1=[Q1], T2=[Q0], T3=[Q1] + * - 3 threads, 3 queues: T0=[Q0], T1=[Q1], T2=[Q2] (1:1 mapping) + * + * Phase 1: Mark which queues each thread handles (boolean mapping) + */ + for (i = 0, j = 0; i < queues || j < nthreads; i++, j++) { + q_thread_map[j % nthreads][i % queues] = 1; + } + + /* + * Phase 2: Convert boolean mapping to sequential indices within each thread. + * + * Transform from: q_thread_map[thread][queue] = 1 (handles queue) + * To: q_thread_map[thread][queue] = N (queue index within thread) + * + * This allows each thread to know the local index of each queue it handles, + * which is essential for buffer allocation and management. For example: + * - Thread 0 handling queues [0,2] becomes: q_thread_map[0][0]=1, q_thread_map[0][2]=2 + * - Thread 1 handling queues [1,3] becomes: q_thread_map[1][1]=1, q_thread_map[1][3]=2 + */ + for (j = 0; j < nthreads; j++) { + unsigned char seq = 1; + + for (i = 0; i < queues; i++) { + if (q_thread_map[j][i]) + q_thread_map[j][i] = seq++; + } + } + +#if 0 + for (j = 0; j < nthreads; j++) { + printf("thread %0d: ", j); + for (i = 0; i < queues; i++) { + if (q_thread_map[j][i]) + printf("%03u ", i); + } + printf("\n"); + } + printf("\n"); + for (j = 0; j < nthreads; j++) { + for (i = 0; i < queues; i++) { + printf("%03u ", q_thread_map[j][i]); + } + printf("\n"); + } +#endif +} diff --git a/tools/testing/selftests/ublk/common.c b/tools/testing/selftests/ublk/common.c index d9873d4d50d0..530f9877c9dd 100644 --- a/tools/testing/selftests/ublk/common.c +++ b/tools/testing/selftests/ublk/common.c @@ -16,7 +16,7 @@ int backing_file_tgt_init(struct ublk_dev *dev, unsigned int nr_direct) { int fd, i; - assert(dev->nr_fds == 1); + ublk_assert(dev->nr_fds == 1); for (i = 0; i < dev->tgt.nr_backing_files; i++) { char *file = dev->tgt.backing_file[i]; diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index c3ce5ff72422..228af2580ac6 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -10,7 +10,7 @@ static enum io_uring_op ublk_to_uring_op(const struct ublksrv_io_desc *iod, int return zc ? IORING_OP_READ_FIXED : IORING_OP_READ; else if (ublk_op == UBLK_IO_OP_WRITE) return zc ? 
IORING_OP_WRITE_FIXED : IORING_OP_WRITE; - assert(0); + ublk_assert(0); } static int loop_queue_flush_io(struct ublk_thread *t, struct ublk_queue *q, @@ -39,6 +39,7 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, __u32 len = iod->nr_sectors << 9; struct io_uring_sqe *sqe[3]; void *addr = io->buf_addr; + unsigned short buf_index = ublk_io_buf_idx(t, q, tag); if (iod->op_flags & UBLK_IO_F_INTEGRITY) { ublk_io_alloc_sqes(t, sqe, 1); @@ -62,7 +63,7 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, len, offset); if (auto_zc) - sqe[0]->buf_index = tag; + sqe[0]->buf_index = buf_index; io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE); /* bit63 marks us as tgt io */ sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1); @@ -71,7 +72,7 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, ublk_io_alloc_sqes(t, sqe, 3); - io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, io->buf_index); + io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_index); sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; sqe[0]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); @@ -79,11 +80,11 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, io_uring_prep_rw(op, sqe[1], ublk_get_registered_fd(q, 1) /*fds[1]*/, 0, len, offset); - sqe[1]->buf_index = tag; + sqe[1]->buf_index = buf_index; sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK; sqe[1]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1); - io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, io->buf_index); + io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, buf_index); sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1); return !!(iod->op_flags & UBLK_IO_F_INTEGRITY) + 2; diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 3472ce7426ba..2da37557e1a9 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -435,6 +435,8 @@ static void ublk_thread_deinit(struct ublk_thread *t) { io_uring_unregister_buffers(&t->ring); + ublk_batch_free_buf(t); + io_uring_unregister_ring_fd(&t->ring); if (t->ring.ring_fd > 0) { @@ -453,6 +455,7 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags, int cmd_buf_size, io_buf_size, integrity_size; unsigned long off; + pthread_spin_init(&q->lock, PTHREAD_PROCESS_PRIVATE); q->tgt_ops = dev->tgt.ops; q->flags = 0; q->q_depth = depth; @@ -517,6 +520,10 @@ static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flag int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth; int ret; + /* FETCH_IO_CMDS is multishot, so increase cq depth for BATCH_IO */ + if (ublk_dev_batch_io(dev)) + cq_depth += dev->dev_info.queue_depth * 2; + ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth, IORING_SETUP_COOP_TASKRUN | IORING_SETUP_SINGLE_ISSUER | @@ -531,15 +538,33 @@ static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flag unsigned nr_ios = dev->dev_info.queue_depth * dev->dev_info.nr_hw_queues; unsigned max_nr_ios_per_thread = nr_ios / dev->nthreads; max_nr_ios_per_thread += !!(nr_ios % dev->nthreads); - ret = io_uring_register_buffers_sparse( - &t->ring, max_nr_ios_per_thread); + + t->nr_bufs = max_nr_ios_per_thread; + } else { + t->nr_bufs = 0; + } + + if (ublk_dev_batch_io(dev)) + ublk_batch_prepare(t); + + if (t->nr_bufs) { + ret = 
io_uring_register_buffers_sparse(&t->ring, t->nr_bufs); if (ret) { - ublk_err("ublk dev %d thread %d register spare buffers failed %d", + ublk_err("ublk dev %d thread %d register spare buffers failed %d\n", dev->dev_info.dev_id, t->idx, ret); goto fail; } } + if (ublk_dev_batch_io(dev)) { + ret = ublk_batch_alloc_buf(t); + if (ret) { + ublk_err("ublk dev %d thread %d alloc batch buf failed %d\n", + dev->dev_info.dev_id, t->idx, ret); + goto fail; + } + } + io_uring_register_ring_fd(&t->ring); if (flags & UBLKS_Q_NO_UBLK_FIXED_FD) { @@ -605,16 +630,17 @@ static void ublk_dev_unprep(struct ublk_dev *dev) close(dev->fds[0]); } -static void ublk_set_auto_buf_reg(const struct ublk_queue *q, +static void ublk_set_auto_buf_reg(const struct ublk_thread *t, + const struct ublk_queue *q, struct io_uring_sqe *sqe, unsigned short tag) { struct ublk_auto_buf_reg buf = {}; if (q->tgt_ops->buf_index) - buf.index = q->tgt_ops->buf_index(q, tag); + buf.index = q->tgt_ops->buf_index(t, q, tag); else - buf.index = q->ios[tag].buf_index; + buf.index = ublk_io_buf_idx(t, q, tag); if (ublk_queue_auto_zc_fallback(q)) buf.flags = UBLK_AUTO_BUF_REG_FALLBACK; @@ -730,7 +756,7 @@ int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io) cmd->addr = 0; if (ublk_queue_use_auto_zc(q)) - ublk_set_auto_buf_reg(q, sqe[0], io->tag); + ublk_set_auto_buf_reg(t, q, sqe[0], io->tag); user_data = build_user_data(io->tag, _IOC_NR(cmd_op), 0, q->q_id, 0); io_uring_sqe_set_data64(sqe[0], user_data); @@ -819,13 +845,15 @@ static void ublk_handle_uring_cmd(struct ublk_thread *t, unsigned tag = user_data_to_tag(cqe->user_data); struct ublk_io *io = &q->ios[tag]; + t->cmd_inflight--; + if (!fetch) { t->state |= UBLKS_T_STOPPING; io->flags &= ~UBLKS_IO_NEED_FETCH_RQ; } if (cqe->res == UBLK_IO_RES_OK) { - assert(tag < q->q_depth); + ublk_assert(tag < q->q_depth); if (ublk_queue_use_user_copy(q)) ublk_user_copy(io, UBLK_IO_OP_WRITE); @@ -853,28 +881,30 @@ static void ublk_handle_cqe(struct ublk_thread *t, { struct ublk_dev *dev = t->dev; unsigned q_id = user_data_to_q_id(cqe->user_data); - struct ublk_queue *q = &dev->q[q_id]; unsigned cmd_op = user_data_to_op(cqe->user_data); - if (cqe->res < 0 && cqe->res != -ENODEV) - ublk_err("%s: res %d userdata %llx queue state %x\n", __func__, - cqe->res, cqe->user_data, q->flags); + if (cqe->res < 0 && cqe->res != -ENODEV && cqe->res != -ENOBUFS) + ublk_err("%s: res %d userdata %llx thread state %x\n", __func__, + cqe->res, cqe->user_data, t->state); - ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (qid %d tag %u cmd_op %u target %d/%d) stopping %d\n", - __func__, cqe->res, q->q_id, user_data_to_tag(cqe->user_data), - cmd_op, is_target_io(cqe->user_data), + ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (thread %d qid %d tag %u cmd_op %x " + "data %lx target %d/%d) stopping %d\n", + __func__, cqe->res, t->idx, q_id, + user_data_to_tag(cqe->user_data), + cmd_op, cqe->user_data, is_target_io(cqe->user_data), user_data_to_tgt_data(cqe->user_data), (t->state & UBLKS_T_STOPPING)); /* Don't retrieve io in case of target io */ if (is_target_io(cqe->user_data)) { - ublksrv_handle_tgt_cqe(t, q, cqe); + ublksrv_handle_tgt_cqe(t, &dev->q[q_id], cqe); return; } - t->cmd_inflight--; - - ublk_handle_uring_cmd(t, q, cqe); + if (ublk_thread_batch_io(t)) + ublk_batch_compl_cmd(t, cqe); + else + ublk_handle_uring_cmd(t, &dev->q[q_id], cqe); } static int ublk_reap_events_uring(struct ublk_thread *t) @@ -906,7 +936,13 @@ static int ublk_process_io(struct ublk_thread *t) return -ENODEV; ret = io_uring_submit_and_wait(&t->ring, 
1); - reapped = ublk_reap_events_uring(t); + if (ublk_thread_batch_io(t)) { + ublk_batch_prep_commit(t); + reapped = ublk_reap_events_uring(t); + ublk_batch_commit_io_cmds(t); + } else { + reapped = ublk_reap_events_uring(t); + } ublk_dbg(UBLK_DBG_THREAD, "submit result %d, reapped %d stop %d idle %d\n", ret, reapped, (t->state & UBLKS_T_STOPPING), @@ -922,6 +958,7 @@ struct ublk_thread_info { sem_t *ready; cpu_set_t *affinity; unsigned long long extra_flags; + unsigned char (*q_thread_map)[UBLK_MAX_QUEUES]; }; static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info) @@ -931,6 +968,26 @@ static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info) info->dev->dev_info.dev_id, info->idx); } +static void ublk_batch_setup_queues(struct ublk_thread *t) +{ + int i; + + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) { + struct ublk_queue *q = &t->dev->q[i]; + int ret; + + /* + * Only prepare io commands in the mapped thread context, + * otherwise io command buffer index may not work as expected + */ + if (t->q_map[i] == 0) + continue; + + ret = ublk_batch_queue_prep_io_cmds(t, q); + ublk_assert(ret >= 0); + } +} + static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_info *info) { struct ublk_thread t = { @@ -940,6 +997,10 @@ static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_inf int dev_id = info->dev->dev_info.dev_id; int ret; + /* Copy per-thread queue mapping into thread-local variable */ + if (info->q_thread_map) + memcpy(t.q_map, info->q_thread_map[info->idx], sizeof(t.q_map)); + ret = ublk_thread_init(&t, info->extra_flags); if (ret) { ublk_err("ublk dev %d thread %u init failed\n", @@ -951,8 +1012,14 @@ static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_inf ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n", gettid(), dev_id, t.idx); - /* submit all io commands to ublk driver */ - ublk_submit_fetch_commands(&t); + if (!ublk_thread_batch_io(&t)) { + /* submit all io commands to ublk driver */ + ublk_submit_fetch_commands(&t); + } else { + ublk_batch_setup_queues(&t); + ublk_batch_start_fetch(&t); + } + do { if (ublk_process_io(&t) < 0) break; @@ -1024,6 +1091,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) struct ublk_thread_info *tinfo; unsigned long long extra_flags = 0; cpu_set_t *affinity_buf; + unsigned char (*q_thread_map)[UBLK_MAX_QUEUES] = NULL; void *thread_ret; sem_t ready; int ret, i; @@ -1043,6 +1111,16 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) if (ret) return ret; + if (ublk_dev_batch_io(dev)) { + q_thread_map = calloc(dev->nthreads, sizeof(*q_thread_map)); + if (!q_thread_map) { + ret = -ENOMEM; + goto fail; + } + ublk_batch_setup_map(q_thread_map, dev->nthreads, + dinfo->nr_hw_queues); + } + if (ctx->auto_zc_fallback) extra_flags = UBLKS_Q_AUTO_BUF_REG_FALLBACK; if (ctx->no_ublk_fixed_fd) @@ -1066,6 +1144,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) tinfo[i].idx = i; tinfo[i].ready = &ready; tinfo[i].extra_flags = extra_flags; + tinfo[i].q_thread_map = q_thread_map; /* * If threads are not tied 1:1 to queues, setting thread @@ -1085,6 +1164,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) for (i = 0; i < dev->nthreads; i++) sem_wait(&ready); free(affinity_buf); + free(q_thread_map); /* everything is fine now, start us */ if (ctx->recovery) @@ -1253,7 +1333,8 @@ static int __cmd_dev_add(const 
struct dev_ctx *ctx) goto fail; } - if (nthreads != nr_queues && !ctx->per_io_tasks) { + if (nthreads != nr_queues && (!ctx->per_io_tasks && + !(ctx->flags & UBLK_F_BATCH_IO))) { ublk_err("%s: threads %u must be same as queues %u if " "not using per_io_tasks\n", __func__, nthreads, nr_queues); @@ -1532,7 +1613,8 @@ static int cmd_dev_get_features(void) FEAT_NAME(UBLK_F_PER_IO_DAEMON), FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON), FEAT_NAME(UBLK_F_INTEGRITY), - FEAT_NAME(UBLK_F_SAFE_STOP_DEV) + FEAT_NAME(UBLK_F_SAFE_STOP_DEV), + FEAT_NAME(UBLK_F_BATCH_IO), }; struct ublk_dev *dev; __u64 features = 0; @@ -1630,6 +1712,7 @@ static void __cmd_create_help(char *exe, bool recovery) printf("\t[--nthreads threads] [--per_io_tasks]\n"); printf("\t[--integrity_capable] [--integrity_reftag] [--metadata_size SIZE] " "[--pi_offset OFFSET] [--csum_type ip|t10dif|nvme] [--tag_size SIZE]\n"); + printf("\t[--batch|-b]\n"); printf("\t[target options] [backfile1] [backfile2] ...\n"); printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n"); printf("\tdefault: nthreads=nr_queues"); @@ -1702,6 +1785,7 @@ int main(int argc, char *argv[]) { "csum_type", 1, NULL, 0 }, { "tag_size", 1, NULL, 0 }, { "safe", 0, NULL, 0 }, + { "batch", 0, NULL, 'b'}, { 0, 0, 0, 0 } }; const struct ublk_tgt_ops *ops = NULL; @@ -1724,12 +1808,15 @@ int main(int argc, char *argv[]) opterr = 0; optind = 2; - while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gazu", + while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gazub", longopts, &option_idx)) != -1) { switch (opt) { case 'a': ctx.all = 1; break; + case 'b': + ctx.flags |= UBLK_F_BATCH_IO; + break; case 'n': ctx.dev_id = strtol(optarg, NULL, 10); break; @@ -1834,6 +1921,11 @@ int main(int argc, char *argv[]) } } + if (ctx.per_io_tasks && (ctx.flags & UBLK_F_BATCH_IO)) { + ublk_err("per_io_task and F_BATCH_IO conflict\n"); + return -EINVAL; + } + /* auto_zc_fallback depends on F_AUTO_BUF_REG & F_SUPPORT_ZERO_COPY */ if (ctx.auto_zc_fallback && !((ctx.flags & UBLK_F_AUTO_BUF_REG) && @@ -1868,6 +1960,13 @@ int main(int argc, char *argv[]) return -EINVAL; } + if ((ctx.flags & UBLK_F_AUTO_BUF_REG) && + (ctx.flags & UBLK_F_BATCH_IO) && + (ctx.nthreads > ctx.nr_hw_queues)) { + ublk_err("too many threads for F_AUTO_BUF_REG & F_BATCH_IO\n"); + return -EINVAL; + } + i = optind; while (i < argc && ctx.nr_files < MAX_BACK_FILES) { ctx.files[ctx.nr_files++] = argv[i++]; diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index cb757fd9bf9d..ca97deb5e208 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -150,7 +150,8 @@ struct ublk_tgt_ops { void (*usage)(const struct ublk_tgt_ops *ops); /* return buffer index for UBLK_F_AUTO_BUF_REG */ - unsigned short (*buf_index)(const struct ublk_queue *, int tag); + unsigned short (*buf_index)(const struct ublk_thread *t, + const struct ublk_queue *, int tag); }; struct ublk_tgt { @@ -172,24 +173,76 @@ struct ublk_queue { const struct ublk_tgt_ops *tgt_ops; struct ublksrv_io_desc *io_cmd_buf; -/* borrow one bit of ublk uapi flags, which may never be used */ +/* borrow three bit of ublk uapi flags, which may never be used */ #define UBLKS_Q_AUTO_BUF_REG_FALLBACK (1ULL << 63) #define UBLKS_Q_NO_UBLK_FIXED_FD (1ULL << 62) +#define UBLKS_Q_PREPARED (1ULL << 61) __u64 flags; int ublk_fd; /* cached ublk char device fd */ __u8 metadata_size; struct ublk_io ios[UBLK_QUEUE_DEPTH]; + + /* used for prep io commands */ + pthread_spinlock_t lock; +}; + 
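+/*
+ * Batch-mode bookkeeping used by batch.c:
+ *
+ * - struct ublk_batch_elem is the commit element written back to the
+ *   driver.  The first 8 bytes (tag, buf_index, result) match the kernel's
+ *   `ublk_elem_header`; buf_addr is only consumed by the driver when
+ *   UBLK_BATCH_F_HAS_BUF_ADDR is set in the command flags.
+ *
+ * - struct batch_commit_buf accumulates completed IOs for one queue.
+ *   buf_idx stays UBLKS_T_COMMIT_BUF_INV_IDX until the first completion
+ *   allocates a commit buffer, and ublk_batch_commit_io_cmds() flushes
+ *   `done` elements once per reap cycle.
+ *
+ * - struct batch_fetch_buf backs one multishot UBLK_U_IO_FETCH_IO_CMDS:
+ *   `br` is the incrementally consumed provided-buffer ring and
+ *   fetch_buf_off tracks how many bytes of 16-bit tags have already been
+ *   handled by ublk_compl_batch_fetch().
+ */
+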
+/* align with `ublk_elem_header` */ +struct ublk_batch_elem { + __u16 tag; + __u16 buf_index; + __s32 result; + __u64 buf_addr; +}; + +struct batch_commit_buf { + unsigned short q_id; + unsigned short buf_idx; + void *elem; + unsigned short done; + unsigned short count; +}; + +struct batch_fetch_buf { + struct io_uring_buf_ring *br; + void *fetch_buf; + unsigned int fetch_buf_size; + unsigned int fetch_buf_off; }; struct ublk_thread { + /* Thread-local copy of queue-to-thread mapping for this thread */ + unsigned char q_map[UBLK_MAX_QUEUES]; + struct ublk_dev *dev; - unsigned idx; + unsigned short idx; + unsigned short nr_queues; #define UBLKS_T_STOPPING (1U << 0) #define UBLKS_T_IDLE (1U << 1) +#define UBLKS_T_BATCH_IO (1U << 31) /* readonly */ unsigned state; unsigned int cmd_inflight; unsigned int io_inflight; + + unsigned short nr_bufs; + + /* followings are for BATCH_IO */ + unsigned short commit_buf_start; + unsigned char commit_buf_elem_size; + /* + * We just support single device, so pre-calculate commit/prep flags + */ + unsigned short cmd_flags; + unsigned int nr_commit_buf; + unsigned int commit_buf_size; + void *commit_buf; +#define UBLKS_T_COMMIT_BUF_INV_IDX ((unsigned short)-1) + struct allocator commit_buf_alloc; + struct batch_commit_buf *commit; + /* FETCH_IO_CMDS buffer */ + unsigned short nr_fetch_bufs; + struct batch_fetch_buf *fetch; + struct io_uring ring; }; @@ -210,6 +263,27 @@ struct ublk_dev { extern int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io); +static inline int __ublk_use_batch_io(__u64 flags) +{ + return flags & UBLK_F_BATCH_IO; +} + +static inline int ublk_queue_batch_io(const struct ublk_queue *q) +{ + return __ublk_use_batch_io(q->flags); +} + +static inline int ublk_dev_batch_io(const struct ublk_dev *dev) +{ + return __ublk_use_batch_io(dev->dev_info.flags); +} + +/* only work for handle single device in this pthread context */ +static inline int ublk_thread_batch_io(const struct ublk_thread *t) +{ + return t->state & UBLKS_T_BATCH_IO; +} + static inline void ublk_set_integrity_params(const struct dev_ctx *ctx, struct ublk_params *params) { @@ -260,9 +334,9 @@ static inline __u64 build_user_data(unsigned tag, unsigned op, { /* we only have 7 bits to encode q_id */ _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7); - assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7)); + ublk_assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7)); - return tag | (op << 16) | (tgt_data << 24) | + return tag | ((__u64)op << 16) | ((__u64)tgt_data << 24) | (__u64)q_id << 56 | (__u64)is_target_io << 63; } @@ -393,33 +467,22 @@ static inline void ublk_set_sqe_cmd_op(struct io_uring_sqe *sqe, __u32 cmd_op) addr[1] = 0; } -static inline struct ublk_io *ublk_get_io(struct ublk_queue *q, unsigned tag) -{ - return &q->ios[tag]; -} +static inline unsigned short ublk_batch_io_buf_idx( + const struct ublk_thread *t, const struct ublk_queue *q, + unsigned tag); -static inline int ublk_complete_io(struct ublk_thread *t, struct ublk_queue *q, - unsigned tag, int res) +static inline unsigned short ublk_io_buf_idx(const struct ublk_thread *t, + const struct ublk_queue *q, + unsigned tag) { - struct ublk_io *io = &q->ios[tag]; - - ublk_mark_io_done(io, res); - - return ublk_queue_io_cmd(t, io); + if (ublk_queue_batch_io(q)) + return ublk_batch_io_buf_idx(t, q, tag); + return q->ios[tag].buf_index; } -static inline void ublk_queued_tgt_io(struct ublk_thread *t, struct ublk_queue *q, - unsigned tag, int queued) +static inline struct ublk_io 
*ublk_get_io(struct ublk_queue *q, unsigned tag) { - if (queued < 0) - ublk_complete_io(t, q, tag, queued); - else { - struct ublk_io *io = ublk_get_io(q, tag); - - t->io_inflight += queued; - io->tgt_ios = queued; - io->result = 0; - } + return &q->ios[tag]; } static inline int ublk_completed_tgt_io(struct ublk_thread *t, @@ -457,6 +520,84 @@ static inline int ublk_queue_no_buf(const struct ublk_queue *q) return ublk_queue_use_zc(q) || ublk_queue_use_auto_zc(q); } +static inline int ublk_batch_commit_prepared(struct batch_commit_buf *cb) +{ + return cb->buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX; +} + +static inline unsigned ublk_queue_idx_in_thread(const struct ublk_thread *t, + const struct ublk_queue *q) +{ + unsigned char idx; + + idx = t->q_map[q->q_id]; + ublk_assert(idx != 0); + return idx - 1; +} + +/* + * Each IO's buffer index has to be calculated by this helper for + * UBLKS_T_BATCH_IO + */ +static inline unsigned short ublk_batch_io_buf_idx( + const struct ublk_thread *t, const struct ublk_queue *q, + unsigned tag) +{ + return ublk_queue_idx_in_thread(t, q) * q->q_depth + tag; +} + +/* Queue UBLK_U_IO_PREP_IO_CMDS for a specific queue with batch elements */ +int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q); +/* Start fetching I/O commands using multishot UBLK_U_IO_FETCH_IO_CMDS */ +void ublk_batch_start_fetch(struct ublk_thread *t); +/* Handle completion of batch I/O commands (prep/commit) */ +void ublk_batch_compl_cmd(struct ublk_thread *t, + const struct io_uring_cqe *cqe); +/* Initialize batch I/O state and calculate buffer parameters */ +void ublk_batch_prepare(struct ublk_thread *t); +/* Allocate and register commit buffers for batch operations */ +int ublk_batch_alloc_buf(struct ublk_thread *t); +/* Free commit buffers and cleanup batch allocator */ +void ublk_batch_free_buf(struct ublk_thread *t); + +/* Prepare a new commit buffer for batching completed I/O operations */ +void ublk_batch_prep_commit(struct ublk_thread *t); +/* Submit UBLK_U_IO_COMMIT_IO_CMDS with batched completed I/O operations */ +void ublk_batch_commit_io_cmds(struct ublk_thread *t); +/* Add a completed I/O operation to the current batch commit buffer */ +void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int res); +void ublk_batch_setup_map(unsigned char (*q_thread_map)[UBLK_MAX_QUEUES], + int nthreads, int queues); + +static inline int ublk_complete_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int res) +{ + if (ublk_queue_batch_io(q)) { + ublk_batch_complete_io(t, q, tag, res); + return 0; + } else { + struct ublk_io *io = &q->ios[tag]; + + ublk_mark_io_done(io, res); + return ublk_queue_io_cmd(t, io); + } +} + +static inline void ublk_queued_tgt_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int queued) +{ + if (queued < 0) + ublk_complete_io(t, q, tag, queued); + else { + struct ublk_io *io = ublk_get_io(q, tag); + + t->io_inflight += queued; + io->tgt_ios = queued; + io->result = 0; + } +} + extern const struct ublk_tgt_ops null_tgt_ops; extern const struct ublk_tgt_ops loop_tgt_ops; extern const struct ublk_tgt_ops stripe_tgt_ops; diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c index 3aa162f08476..7656888f4149 100644 --- a/tools/testing/selftests/ublk/null.c +++ b/tools/testing/selftests/ublk/null.c @@ -44,12 +44,12 @@ static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) } static void __setup_nop_io(int tag, const struct 
ublksrv_io_desc *iod, - struct io_uring_sqe *sqe, int q_id) + struct io_uring_sqe *sqe, int q_id, unsigned buf_idx) { unsigned ublk_op = ublksrv_get_op(iod); io_uring_prep_nop(sqe); - sqe->buf_index = tag; + sqe->buf_index = buf_idx; sqe->flags |= IOSQE_FIXED_FILE; sqe->rw_flags = IORING_NOP_FIXED_BUFFER | IORING_NOP_INJECT_RESULT; sqe->len = iod->nr_sectors << 9; /* injected result */ @@ -61,18 +61,19 @@ static int null_queue_zc_io(struct ublk_thread *t, struct ublk_queue *q, { const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); struct io_uring_sqe *sqe[3]; + unsigned short buf_idx = ublk_io_buf_idx(t, q, tag); ublk_io_alloc_sqes(t, sqe, 3); - io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index); + io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_idx); sqe[0]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; - __setup_nop_io(tag, iod, sqe[1], q->q_id); + __setup_nop_io(tag, iod, sqe[1], q->q_id, buf_idx); sqe[1]->flags |= IOSQE_IO_HARDLINK; - io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index); + io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, buf_idx); sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1); // buf register is marked as IOSQE_CQE_SKIP_SUCCESS @@ -86,7 +87,7 @@ static int null_queue_auto_zc_io(struct ublk_thread *t, struct ublk_queue *q, struct io_uring_sqe *sqe[1]; ublk_io_alloc_sqes(t, sqe, 1); - __setup_nop_io(tag, iod, sqe[0], q->q_id); + __setup_nop_io(tag, iod, sqe[0], q->q_id, ublk_io_buf_idx(t, q, tag)); return 1; } @@ -137,11 +138,12 @@ static int ublk_null_queue_io(struct ublk_thread *t, struct ublk_queue *q, * return invalid buffer index for triggering auto buffer register failure, * then UBLK_IO_RES_NEED_REG_BUF handling is covered */ -static unsigned short ublk_null_buf_index(const struct ublk_queue *q, int tag) +static unsigned short ublk_null_buf_index(const struct ublk_thread *t, + const struct ublk_queue *q, int tag) { if (ublk_queue_auto_zc_fallback(q)) return (unsigned short)-1; - return q->ios[tag].buf_index; + return ublk_io_buf_idx(t, q, tag); } const struct ublk_tgt_ops null_tgt_ops = { diff --git a/tools/testing/selftests/ublk/settings b/tools/testing/selftests/ublk/settings new file mode 100644 index 000000000000..682a40f1c8e6 --- /dev/null +++ b/tools/testing/selftests/ublk/settings @@ -0,0 +1 @@ +timeout=150 diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c index 2be1c36438e7..dca819f5366e 100644 --- a/tools/testing/selftests/ublk/stripe.c +++ b/tools/testing/selftests/ublk/stripe.c @@ -96,12 +96,12 @@ static void calculate_stripe_array(const struct stripe_conf *conf, this->seq = seq; s->nr += 1; } else { - assert(seq == this->seq); - assert(this->start + this->nr_sects == stripe_off); + ublk_assert(seq == this->seq); + ublk_assert(this->start + this->nr_sects == stripe_off); this->nr_sects += nr_sects; } - assert(this->nr_vec < this->cap); + ublk_assert(this->nr_vec < this->cap); this->vec[this->nr_vec].iov_base = (void *)(base + done); this->vec[this->nr_vec++].iov_len = nr_sects << 9; @@ -120,7 +120,7 @@ static inline enum io_uring_op stripe_to_uring_op( return zc ? IORING_OP_READV_FIXED : IORING_OP_READV; else if (ublk_op == UBLK_IO_OP_WRITE) return zc ? 
IORING_OP_WRITEV_FIXED : IORING_OP_WRITEV; - assert(0); + ublk_assert(0); } static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, @@ -135,6 +135,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, struct ublk_io *io = ublk_get_io(q, tag); int i, extra = zc ? 2 : 0; void *base = io->buf_addr; + unsigned short buf_idx = ublk_io_buf_idx(t, q, tag); io->private_data = s; calculate_stripe_array(conf, iod, s, base); @@ -142,7 +143,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, ublk_io_alloc_sqes(t, sqe, s->nr + extra); if (zc) { - io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, io->buf_index); + io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_idx); sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; sqe[0]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); @@ -158,7 +159,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, t->start << 9); io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE); if (auto_zc || zc) { - sqe[i]->buf_index = tag; + sqe[i]->buf_index = buf_idx; if (zc) sqe[i]->flags |= IOSQE_IO_HARDLINK; } @@ -168,7 +169,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, if (zc) { struct io_uring_sqe *unreg = sqe[s->nr + 1]; - io_uring_prep_buf_unregister(unreg, q, tag, q->q_id, io->buf_index); + io_uring_prep_buf_unregister(unreg, q, tag, q->q_id, buf_idx); unreg->user_data = build_user_data( tag, ublk_cmd_op_nr(unreg->cmd_op), 0, q->q_id, 1); } @@ -322,7 +323,7 @@ static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) if (!dev->tgt.nr_backing_files || dev->tgt.nr_backing_files > NR_STRIPE) return -EINVAL; - assert(dev->nr_fds == dev->tgt.nr_backing_files + 1); + ublk_assert(dev->nr_fds == dev->tgt.nr_backing_files + 1); for (i = 0; i < dev->tgt.nr_backing_files; i++) dev->tgt.backing_file_size[i] &= ~((1 << chunk_shift) - 1); diff --git a/tools/testing/selftests/ublk/test_batch_01.sh b/tools/testing/selftests/ublk/test_batch_01.sh new file mode 100755 index 000000000000..9fa9fff5c62f --- /dev/null +++ b/tools/testing/selftests/ublk/test_batch_01.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="batch_01" +ERR_CODE=0 + +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "generic" "test basic function of UBLK_F_BATCH_IO" + +_create_backfile 0 256M +_create_backfile 1 256M + +dev_id=$(_add_ublk_dev -t loop -q 2 -b "${UBLK_BACKFILES[0]}") +_check_add_dev $TID $? + +if ! _mkfs_mount_test /dev/ublkb"${dev_id}"; then + _cleanup_test "generic" + _show_result $TID 255 +fi + +dev_id=$(_add_ublk_dev -t stripe -b --auto_zc "${UBLK_BACKFILES[0]}" "${UBLK_BACKFILES[1]}") +_check_add_dev $TID $? +_mkfs_mount_test /dev/ublkb"${dev_id}" +ERR_CODE=$? + +_cleanup_test "generic" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_batch_02.sh b/tools/testing/selftests/ublk/test_batch_02.sh new file mode 100755 index 000000000000..b477f91359e1 --- /dev/null +++ b/tools/testing/selftests/ublk/test_batch_02.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="batch_02" +ERR_CODE=0 + +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "generic" "test UBLK_F_BATCH_IO with 4_threads vs. 
1_queues" + +_create_backfile 0 512M + +dev_id=$(_add_ublk_dev -t loop -q 1 --nthreads 4 -b "${UBLK_BACKFILES[0]}") +_check_add_dev $TID $? + +# run fio over the ublk disk +fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio --rw=readwrite \ + --iodepth=32 --size=100M --numjobs=4 > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test "generic" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_batch_03.sh b/tools/testing/selftests/ublk/test_batch_03.sh new file mode 100755 index 000000000000..13a2b3d3a1b9 --- /dev/null +++ b/tools/testing/selftests/ublk/test_batch_03.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="batch_03" +ERR_CODE=0 + +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "generic" "test UBLK_F_BATCH_IO with 1_threads vs. 4_queues" + +_create_backfile 0 512M + +dev_id=$(_add_ublk_dev -t loop -q 4 --nthreads 1 -b "${UBLK_BACKFILES[0]}") +_check_add_dev $TID $? + +# run fio over the ublk disk +fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio --rw=readwrite \ + --iodepth=32 --size=100M --numjobs=4 > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test "generic" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_generic_04.sh b/tools/testing/selftests/ublk/test_generic_04.sh index baf5b156193d..be2292822bbe 100755 --- a/tools/testing/selftests/ublk/test_generic_04.sh +++ b/tools/testing/selftests/ublk/test_generic_04.sh @@ -26,6 +26,11 @@ _create_backfile 0 256M _create_backfile 1 128M _create_backfile 2 128M +ublk_run_recover_test -t null -q 2 -r 1 -b & +ublk_run_recover_test -t loop -q 2 -r 1 -b "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + ublk_run_recover_test -t null -q 2 -r 1 & ublk_run_recover_test -t loop -q 2 -r 1 "${UBLK_BACKFILES[0]}" & ublk_run_recover_test -t stripe -q 2 -r 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & diff --git a/tools/testing/selftests/ublk/test_generic_05.sh b/tools/testing/selftests/ublk/test_generic_05.sh index 7b5083afc02a..9b7f71c16d82 100755 --- a/tools/testing/selftests/ublk/test_generic_05.sh +++ b/tools/testing/selftests/ublk/test_generic_05.sh @@ -30,6 +30,11 @@ _create_backfile 0 256M _create_backfile 1 128M _create_backfile 2 128M +ublk_run_recover_test -t null -q 2 -r 1 -z -b & +ublk_run_recover_test -t loop -q 2 -r 1 -z -b "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 -z -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + ublk_run_recover_test -t null -q 2 -r 1 -z & ublk_run_recover_test -t loop -q 2 -r 1 -z "${UBLK_BACKFILES[0]}" & ublk_run_recover_test -t stripe -q 2 -r 1 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & diff --git a/tools/testing/selftests/ublk/test_stress_08.sh b/tools/testing/selftests/ublk/test_stress_08.sh new file mode 100755 index 000000000000..190db0b4f2ad --- /dev/null +++ b/tools/testing/selftests/ublk/test_stress_08.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh +TID="stress_06" +ERR_CODE=0 + +ublk_io_and_remove() +{ + run_io_and_remove "$@" + ERR_CODE=$? + if [ ${ERR_CODE} -ne 0 ]; then + echo "$TID failure: $*" + _show_result $TID $ERR_CODE + fi +} + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +if ! 
_have_feature "ZERO_COPY"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "AUTO_BUF_REG"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "stress" "run IO and remove device(zero copy)" + +_create_backfile 0 256M +_create_backfile 1 128M +_create_backfile 2 128M + +ublk_io_and_remove 8G -t null -q 4 -b & +ublk_io_and_remove 256M -t loop -q 4 --auto_zc -b "${UBLK_BACKFILES[0]}" & +ublk_io_and_remove 256M -t stripe -q 4 --auto_zc -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +ublk_io_and_remove 8G -t null -q 4 -z --auto_zc --auto_zc_fallback -b & +wait + +_cleanup_test "stress" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_stress_09.sh b/tools/testing/selftests/ublk/test_stress_09.sh new file mode 100755 index 000000000000..1b6bdb31da03 --- /dev/null +++ b/tools/testing/selftests/ublk/test_stress_09.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh +TID="stress_07" +ERR_CODE=0 + +ublk_io_and_kill_daemon() +{ + run_io_and_kill_daemon "$@" + ERR_CODE=$? + if [ ${ERR_CODE} -ne 0 ]; then + echo "$TID failure: $*" + _show_result $TID $ERR_CODE + fi +} + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "ZERO_COPY"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "AUTO_BUF_REG"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "stress" "run IO and kill ublk server(zero copy)" + +_create_backfile 0 256M +_create_backfile 1 128M +_create_backfile 2 128M + +ublk_io_and_kill_daemon 8G -t null -q 4 -z -b & +ublk_io_and_kill_daemon 256M -t loop -q 4 --auto_zc -b "${UBLK_BACKFILES[0]}" & +ublk_io_and_kill_daemon 256M -t stripe -q 4 -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +ublk_io_and_kill_daemon 8G -t null -q 4 -z --auto_zc --auto_zc_fallback -b & +wait + +_cleanup_test "stress" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/utils.h b/tools/testing/selftests/ublk/utils.h index a852e0b7153e..aab522f26167 100644 --- a/tools/testing/selftests/ublk/utils.h +++ b/tools/testing/selftests/ublk/utils.h @@ -21,6 +21,60 @@ #define round_up(val, rnd) \ (((val) + ((rnd) - 1)) & ~((rnd) - 1)) +/* small sized & per-thread allocator */ +struct allocator { + unsigned int size; + cpu_set_t *set; +}; + +static inline int allocator_init(struct allocator *a, unsigned size) +{ + a->set = CPU_ALLOC(size); + a->size = size; + + if (a->set) + return 0; + return -ENOMEM; +} + +static inline void allocator_deinit(struct allocator *a) +{ + CPU_FREE(a->set); + a->set = NULL; + a->size = 0; +} + +static inline int allocator_get(struct allocator *a) +{ + int i; + + for (i = 0; i < a->size; i += 1) { + size_t set_size = CPU_ALLOC_SIZE(a->size); + + if (!CPU_ISSET_S(i, set_size, a->set)) { + CPU_SET_S(i, set_size, a->set); + return i; + } + } + + return -1; +} + +static inline void allocator_put(struct allocator *a, int i) +{ + size_t set_size = CPU_ALLOC_SIZE(a->size); + + if (i >= 0 && i < a->size) + CPU_CLR_S(i, set_size, a->set); +} + +static inline int allocator_get_val(struct allocator *a, int i) +{ + size_t set_size = CPU_ALLOC_SIZE(a->size); + + return CPU_ISSET_S(i, set_size, a->set); +} + static inline unsigned int ilog2(unsigned int x) { if (x == 0) @@ -43,6 +97,7 @@ static inline void ublk_err(const char *fmt, ...) 
va_start(ap, fmt); vfprintf(stderr, fmt, ap); + va_end(ap); } static inline void ublk_log(const char *fmt, ...) @@ -52,6 +107,7 @@ static inline void ublk_log(const char *fmt, ...) va_start(ap, fmt); vfprintf(stdout, fmt, ap); + va_end(ap); } } @@ -62,7 +118,15 @@ static inline void ublk_dbg(int level, const char *fmt, ...) va_start(ap, fmt); vfprintf(stdout, fmt, ap); + va_end(ap); } } +#define ublk_assert(x) do { \ + if (!(x)) { \ + ublk_err("%s %d: assert!\n", __func__, __LINE__); \ + assert(x); \ + } \ +} while (0) + #endif
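
The queue-to-thread mapping and the per-thread buffer indexing are the two
pieces of the selftest changes that are easiest to get wrong when extending
them: ublk_batch_setup_map() stores a 1-based local index for every queue a
thread handles, and ublk_batch_io_buf_idx() turns that into a fixed-buffer
index as local_idx * q_depth + tag, so buffer indices are local to each
thread's io_uring.  The standalone sketch below restates that arithmetic;
setup_map(), buf_idx() and the 2-thread/4-queue/depth-8 sizes are
illustrative only, not part of the patch:

#include <stdio.h>

/* made-up sizes for the demo; the real values come from the device info */
#define NR_THREADS	2
#define NR_QUEUES	4
#define Q_DEPTH		8

/* phases 1 and 2 of ublk_batch_setup_map(), restated */
static void setup_map(unsigned char map[NR_THREADS][NR_QUEUES])
{
	int i, j;

	/* phase 1: mark which queues each thread handles (round-robin) */
	for (i = 0, j = 0; i < NR_QUEUES || j < NR_THREADS; i++, j++)
		map[j % NR_THREADS][i % NR_QUEUES] = 1;

	/* phase 2: turn the boolean marks into 1-based local queue indices */
	for (j = 0; j < NR_THREADS; j++) {
		unsigned char seq = 1;

		for (i = 0; i < NR_QUEUES; i++)
			if (map[j][i])
				map[j][i] = seq++;
	}
}

/* same arithmetic as ublk_batch_io_buf_idx(): thread-local buffer index */
static unsigned int buf_idx(unsigned char map[NR_THREADS][NR_QUEUES],
			    int thread, int q_id, unsigned int tag)
{
	return (map[thread][q_id] - 1) * Q_DEPTH + tag;
}

int main(void)
{
	unsigned char map[NR_THREADS][NR_QUEUES] = { { 0 } };
	int t, q;

	setup_map(map);

	for (t = 0; t < NR_THREADS; t++)
		for (q = 0; q < NR_QUEUES; q++)
			if (map[t][q])
				printf("thread %d queue %d: tag 0 -> buffer %u\n",
				       t, q, buf_idx(map, t, q, 0));
	return 0;
}

Running it prints thread 0 handling queues 0 and 2 (tag 0 mapping to buffers
0 and 8) and thread 1 handling queues 1 and 3, matching the T0=[Q0,Q2] /
T1=[Q1,Q3] example in the ublk_batch_setup_map() comment.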
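
The multishot fetch path also deserves a worked example: each
UBLK_U_IO_FETCH_IO_CMDS completion delivers cqe->res bytes of 16-bit tags
appended to the thread's provided fetch buffer, and ublk_compl_batch_fetch()
walks them from the saved offset before handing each tag to the target's
queue_io() handler.  The following self-contained sketch shows just that
parsing step; handle_fetch_payload() and the fabricated payload stand in for
the real io_uring completion:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Parse one multishot fetch completion: `res` bytes of 16-bit tags were
 * appended to the provided buffer at *off (mirrors ublk_compl_batch_fetch()).
 */
static void handle_fetch_payload(const void *fetch_buf, unsigned int *off,
				 int res)
{
	unsigned int end = *off + res;
	unsigned int i;

	for (i = *off; i < end; i += 2) {
		uint16_t tag;

		memcpy(&tag, (const char *)fetch_buf + i, sizeof(tag));
		/* kublk calls q->tgt_ops->queue_io(t, q, tag) here */
		printf("dispatch tag %u\n", (unsigned int)tag);
	}
	*off = end;	/* the next CQE for this buffer continues from here */
}

int main(void)
{
	/* fabricated payload: two completions land in the same buffer */
	uint16_t buf[4] = { 3, 7, 11, 12 };
	unsigned int off = 0;

	handle_fetch_payload(buf, &off, 2 * sizeof(uint16_t)); /* tags 3, 7 */
	handle_fetch_payload(buf, &off, 2 * sizeof(uint16_t)); /* tags 11, 12 */
	return 0;
}

The real code additionally re-arms the buffer ring and resets the offset when
the multishot command terminates (IORING_CQE_F_MORE cleared or -ENOBUFS),
which this sketch leaves out.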