Commit ab03a61

ps-ushankar authored and axboe committed
ublk: have a per-io daemon instead of a per-queue daemon
Currently, ublk_drv associates to each hardware queue (hctx) a unique task (called the queue's ubq_daemon) which is allowed to issue COMMIT_AND_FETCH commands against the hctx. If any other task attempts to do so, the command fails immediately with EINVAL. When considered together with the block layer architecture, the result is that for each CPU C on the system, there is a unique ublk server thread which is allowed to handle I/O submitted on CPU C.

This can lead to suboptimal performance under imbalanced load generation. For an extreme example, suppose all the load is generated on CPUs mapping to a single ublk server thread. Then that thread may be fully utilized and become the bottleneck in the system, while other ublk server threads are totally idle.

This issue can also be addressed directly in the ublk server without kernel support, by having threads dequeue I/Os and pass them around to ensure even load. But this solution requires inter-thread communication at least twice for each I/O (submission and completion), which is generally a bad pattern for performance. The problem gets even worse with zero copy, as more inter-thread communication would be required to have the buffer register/unregister calls come from the correct thread.

Therefore, address this issue in ublk_drv by allowing each I/O to have its own daemon task. Two I/Os in the same queue are now allowed to be serviced by different daemon tasks - this was not possible before. Imbalanced load can then be balanced across all ublk server threads by having the ublk server threads issue FETCH_REQs in a round-robin manner. As a small toy example, consider a system with a single ublk device having 2 queues, each of depth 4. A ublk server having 4 threads could issue its FETCH_REQs against this device as follows (where each entry is the qid,tag pair that the FETCH_REQ targets):

ublk server thread:     T0      T1      T2      T3
                        0,0     0,1     0,2     0,3
                        1,3     1,0     1,1     1,2

This setup allows load that is concentrated on one hctx/ublk_queue to be spread out across all ublk server threads, alleviating the issue described above.

Add the new UBLK_F_PER_IO_DAEMON feature to ublk_drv, which ublk servers can use to test for the presence of this change and tailor their behavior accordingly.

Signed-off-by: Uday Shankar <[email protected]>
Reviewed-by: Caleb Sander Mateos <[email protected]>
Reviewed-by: Ming Lei <[email protected]>
Reviewed-by: Jens Axboe <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jens Axboe <[email protected]>
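For illustration only: the round-robin assignment in the toy example above corresponds to mapping each (qid, tag) pair to thread (qid + tag) % nr_threads. The minimal C sketch below encodes that mapping; the helper name ublk_io_to_thread() and the specific formula are assumptions made for this example, not anything required by the patch or by the ublk UAPI.

#include <assert.h>

/*
 * Hypothetical helper: pick the ublk server thread that should issue the
 * FETCH_REQ for (qid, tag). Staggering by qid spreads a single queue's
 * tags across all threads.
 */
static inline unsigned int ublk_io_to_thread(unsigned int qid,
                                             unsigned int tag,
                                             unsigned int nr_threads)
{
        return (qid + tag) % nr_threads;
}

int main(void)
{
        /* 2 queues of depth 4, served by 4 threads, as in the toy example. */
        assert(ublk_io_to_thread(0, 0, 4) == 0);  /* T0 fetches (0,0) */
        assert(ublk_io_to_thread(0, 3, 4) == 3);  /* T3 fetches (0,3) */
        assert(ublk_io_to_thread(1, 0, 4) == 1);  /* T1 fetches (1,0) */
        assert(ublk_io_to_thread(1, 3, 4) == 0);  /* T0 fetches (1,3) */
        return 0;
}

Any mapping that spreads a queue's tags across threads would do; the point is only that, with per-io daemons, the driver no longer requires all tags of a queue to be fetched by the same task.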
2 files changed: +65 / -55 lines
drivers/block/ublk_drv.c

Lines changed: 56 additions & 55 deletions

@@ -69,7 +69,8 @@
 		| UBLK_F_USER_RECOVERY_FAIL_IO \
 		| UBLK_F_UPDATE_SIZE \
 		| UBLK_F_AUTO_BUF_REG \
-		| UBLK_F_QUIESCE)
+		| UBLK_F_QUIESCE \
+		| UBLK_F_PER_IO_DAEMON)
 
 #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
 		| UBLK_F_USER_RECOVERY_REISSUE \
@@ -166,18 +167,18 @@ struct ublk_io {
 		/* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */
 		struct request *req;
 	};
+
+	struct task_struct *task;
 };
 
 struct ublk_queue {
 	int q_id;
 	int q_depth;
 
 	unsigned long flags;
-	struct task_struct *ubq_daemon;
 	struct ublksrv_io_desc *io_cmd_buf;
 
 	bool force_abort;
-	bool timeout;
 	bool canceling;
 	bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
 	unsigned short nr_io_ready;	/* how many ios setup */
@@ -1099,11 +1100,6 @@ static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
 	return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu);
 }
 
-static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq)
-{
-	return !ubq->ubq_daemon || ubq->ubq_daemon->flags & PF_EXITING;
-}
-
 /* todo: handle partial completion */
 static inline void __ublk_complete_rq(struct request *req)
 {
@@ -1275,13 +1271,13 @@ static void ublk_dispatch_req(struct ublk_queue *ubq,
 	/*
 	 * Task is exiting if either:
 	 *
-	 * (1) current != ubq_daemon.
+	 * (1) current != io->task.
	 * io_uring_cmd_complete_in_task() tries to run task_work
-	 * in a workqueue if ubq_daemon(cmd's task) is PF_EXITING.
+	 * in a workqueue if cmd's task is PF_EXITING.
 	 *
 	 * (2) current->flags & PF_EXITING.
 	 */
-	if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) {
+	if (unlikely(current != io->task || current->flags & PF_EXITING)) {
 		__ublk_abort_rq(ubq, req);
 		return;
 	}
@@ -1330,38 +1326,33 @@ static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd,
 {
 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
 	struct request *rq = pdu->req_list;
-	struct ublk_queue *ubq = pdu->ubq;
 	struct request *next;
 
 	do {
 		next = rq->rq_next;
 		rq->rq_next = NULL;
-		ublk_dispatch_req(ubq, rq, issue_flags);
+		ublk_dispatch_req(rq->mq_hctx->driver_data, rq, issue_flags);
 		rq = next;
 	} while (rq);
 }
 
-static void ublk_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l)
+static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l)
 {
-	struct request *rq = rq_list_peek(l);
-	struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
+	struct io_uring_cmd *cmd = io->cmd;
 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
 
-	pdu->req_list = rq;
+	pdu->req_list = rq_list_peek(l);
 	rq_list_init(l);
 	io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb);
 }
 
 static enum blk_eh_timer_return ublk_timeout(struct request *rq)
 {
 	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
+	struct ublk_io *io = &ubq->ios[rq->tag];
 
 	if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) {
-		if (!ubq->timeout) {
-			send_sig(SIGKILL, ubq->ubq_daemon, 0);
-			ubq->timeout = true;
-		}
-
+		send_sig(SIGKILL, io->task, 0);
 		return BLK_EH_DONE;
 	}
 
@@ -1429,24 +1420,25 @@ static void ublk_queue_rqs(struct rq_list *rqlist)
 {
 	struct rq_list requeue_list = { };
 	struct rq_list submit_list = { };
-	struct ublk_queue *ubq = NULL;
+	struct ublk_io *io = NULL;
 	struct request *req;
 
 	while ((req = rq_list_pop(rqlist))) {
 		struct ublk_queue *this_q = req->mq_hctx->driver_data;
+		struct ublk_io *this_io = &this_q->ios[req->tag];
 
-		if (ubq && ubq != this_q && !rq_list_empty(&submit_list))
-			ublk_queue_cmd_list(ubq, &submit_list);
-		ubq = this_q;
+		if (io && io->task != this_io->task && !rq_list_empty(&submit_list))
+			ublk_queue_cmd_list(io, &submit_list);
+		io = this_io;
 
-		if (ublk_prep_req(ubq, req, true) == BLK_STS_OK)
+		if (ublk_prep_req(this_q, req, true) == BLK_STS_OK)
 			rq_list_add_tail(&submit_list, req);
 		else
 			rq_list_add_tail(&requeue_list, req);
 	}
 
-	if (ubq && !rq_list_empty(&submit_list))
-		ublk_queue_cmd_list(ubq, &submit_list);
+	if (!rq_list_empty(&submit_list))
+		ublk_queue_cmd_list(io, &submit_list);
 	*rqlist = requeue_list;
 }
 
@@ -1474,17 +1466,6 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
 	/* All old ioucmds have to be completed */
 	ubq->nr_io_ready = 0;
 
-	/*
-	 * old daemon is PF_EXITING, put it now
-	 *
-	 * It could be NULL in case of closing one quisced device.
-	 */
-	if (ubq->ubq_daemon)
-		put_task_struct(ubq->ubq_daemon);
-	/* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */
-	ubq->ubq_daemon = NULL;
-	ubq->timeout = false;
-
 	for (i = 0; i < ubq->q_depth; i++) {
 		struct ublk_io *io = &ubq->ios[i];
 
@@ -1495,6 +1476,17 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
 		io->flags &= UBLK_IO_FLAG_CANCELED;
 		io->cmd = NULL;
 		io->addr = 0;
+
+		/*
+		 * old task is PF_EXITING, put it now
+		 *
+		 * It could be NULL in case of closing one quiesced
+		 * device.
+		 */
+		if (io->task) {
+			put_task_struct(io->task);
+			io->task = NULL;
+		}
 	}
 }
 
@@ -1516,7 +1508,7 @@ static void ublk_reset_ch_dev(struct ublk_device *ub)
 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
 		ublk_queue_reinit(ub, ublk_get_queue(ub, i));
 
-	/* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */
+	/* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */
 	ub->mm = NULL;
 	ub->nr_queues_ready = 0;
 	ub->nr_privileged_daemon = 0;
@@ -1783,6 +1775,7 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
 	struct ublk_queue *ubq = pdu->ubq;
 	struct task_struct *task;
+	struct ublk_io *io;
 
 	if (WARN_ON_ONCE(!ubq))
 		return;
@@ -1791,13 +1784,14 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
 		return;
 
 	task = io_uring_cmd_get_task(cmd);
-	if (WARN_ON_ONCE(task && task != ubq->ubq_daemon))
+	io = &ubq->ios[pdu->tag];
+	if (WARN_ON_ONCE(task && task != io->task))
 		return;
 
 	if (!ubq->canceling)
 		ublk_start_cancel(ubq);
 
-	WARN_ON_ONCE(ubq->ios[pdu->tag].cmd != cmd);
+	WARN_ON_ONCE(io->cmd != cmd);
 	ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
 }
 
@@ -1930,8 +1924,6 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
 {
 	ubq->nr_io_ready++;
 	if (ublk_queue_ready(ubq)) {
-		ubq->ubq_daemon = current;
-		get_task_struct(ubq->ubq_daemon);
 		ub->nr_queues_ready++;
 
 		if (capable(CAP_SYS_ADMIN))
@@ -2084,6 +2076,7 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
 	}
 
 	ublk_fill_io_cmd(io, cmd, buf_addr);
+	WRITE_ONCE(io->task, get_task_struct(current));
 	ublk_mark_io_ready(ub, ubq);
 out:
 	mutex_unlock(&ub->mutex);
@@ -2179,6 +2172,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
 		const struct ublksrv_io_cmd *ub_cmd)
 {
 	struct ublk_device *ub = cmd->file->private_data;
+	struct task_struct *task;
 	struct ublk_queue *ubq;
 	struct ublk_io *io;
 	u32 cmd_op = cmd->cmd_op;
@@ -2193,13 +2187,14 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
 		goto out;
 
 	ubq = ublk_get_queue(ub, ub_cmd->q_id);
-	if (ubq->ubq_daemon && ubq->ubq_daemon != current)
-		goto out;
 
 	if (tag >= ubq->q_depth)
 		goto out;
 
 	io = &ubq->ios[tag];
+	task = READ_ONCE(io->task);
+	if (task && task != current)
+		goto out;
 
 	/* there is pending io cmd, something must be wrong */
 	if (io->flags & UBLK_IO_FLAG_ACTIVE) {
@@ -2449,9 +2444,14 @@ static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
 {
 	int size = ublk_queue_cmd_buf_size(ub, q_id);
 	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
+	int i;
+
+	for (i = 0; i < ubq->q_depth; i++) {
+		struct ublk_io *io = &ubq->ios[i];
+		if (io->task)
+			put_task_struct(io->task);
+	}
 
-	if (ubq->ubq_daemon)
-		put_task_struct(ubq->ubq_daemon);
 	if (ubq->io_cmd_buf)
 		free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
 }
@@ -2923,7 +2923,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
 	ub->dev_info.flags &= UBLK_F_ALL;
 
 	ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
-		UBLK_F_URING_CMD_COMP_IN_TASK;
+		UBLK_F_URING_CMD_COMP_IN_TASK |
+		UBLK_F_PER_IO_DAEMON;
 
 	/* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
 	if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
@@ -3188,14 +3189,14 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub,
 	int ublksrv_pid = (int)header->data[0];
 	int ret = -EINVAL;
 
-	pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n",
-			__func__, ub->dev_info.nr_hw_queues, header->dev_id);
-	/* wait until new ubq_daemon sending all FETCH_REQ */
+	pr_devel("%s: Waiting for all FETCH_REQs, dev id %d...\n", __func__,
+		 header->dev_id);
+
 	if (wait_for_completion_interruptible(&ub->completion))
 		return -EINTR;
 
-	pr_devel("%s: All new ubq_daemons(nr: %d) are ready, dev id %d\n",
-			__func__, ub->dev_info.nr_hw_queues, header->dev_id);
+	pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__,
+		 header->dev_id);
 
 	mutex_lock(&ub->mutex);
 	if (ublk_nosrv_should_stop_dev(ub))

include/uapi/linux/ublk_cmd.h

Lines changed: 9 additions & 0 deletions

@@ -272,6 +272,15 @@
  */
 #define UBLK_F_QUIESCE (1ULL << 12)
 
+/*
+ * If this feature is set, ublk_drv supports each (qid,tag) pair having
+ * its own independent daemon task that is responsible for handling it.
+ * If it is not set, daemons are per-queue instead, so for two pairs
+ * (qid1,tag1) and (qid2,tag2), if qid1 == qid2, then the same task must
+ * be responsible for handling (qid1,tag1) and (qid2,tag2).
+ */
+#define UBLK_F_PER_IO_DAEMON (1ULL << 13)
+
 /* device state */
 #define UBLK_S_DEV_DEAD 0
 #define UBLK_S_DEV_LIVE 1
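As a usage sketch only: a ublk server could gate its threading model on this flag roughly as below. Here the features variable is assumed to hold the driver feature mask the server has already obtained (for example via the UBLK_U_CMD_GET_FEATURES control command), and choose_thread() is a hypothetical helper, not part of the UAPI.

#include <linux/types.h>
#include <linux/ublk_cmd.h>

/* Hypothetical helper: map (qid, tag) to a ublk server thread index. */
static unsigned int choose_thread(__u64 features, unsigned int qid,
                                  unsigned int tag, unsigned int nr_threads)
{
        if (features & UBLK_F_PER_IO_DAEMON)
                /* Per-io daemons: any assignment of (qid, tag) pairs is legal. */
                return (qid + tag) % nr_threads;

        /* Older kernels: every tag of a queue must stay on one thread. */
        return qid % nr_threads;
}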
