Skip to content

Commit 2f0a692

Browse files
Ming Lei authored and axboe committed
selftests: ublk: set queue pthread's cpu affinity
In a NUMA machine, ublk IO performance is very sensitive to the queue pthread's affinity setting. Retrieve the queue's affinity and select the 1st CPU as the queue thread's sched affinity; it is observed that a single-CPU task affinity gives stable and good performance if the client application is placed on a proper CPU. Dump this info when adding one ublk device. Use shmem to communicate the queue's tid between parent and daemon. Signed-off-by: Ming Lei <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Jens Axboe <[email protected]>
1 parent 62867a0 commit 2f0a692

File tree

2 files changed

+159
-8
lines changed

2 files changed

+159
-8
lines changed

tools/testing/selftests/ublk/kublk.c

Lines changed: 149 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -206,10 +206,73 @@ static const char *ublk_dev_state_desc(struct ublk_dev *dev)
206206
};
207207
}
208208

209+
/*
 * Render the CPUs present in @set into @buf as a space-separated list
 * (e.g. "0 2 5 "). The output is truncated to @len bytes including the
 * NUL terminator; an empty set yields an empty string.
 */
static void ublk_print_cpu_set(const cpu_set_t *set, char *buf, unsigned len)
{
	unsigned done = 0;
	int i;

	if (len)
		buf[0] = '\0';

	for (i = 0; i < CPU_SETSIZE; i++) {
		if (!CPU_ISSET(i, set))
			continue;
		/*
		 * snprintf() returns the length that *would* have been
		 * written, so after truncation 'done' can exceed 'len'
		 * and the unsigned 'len - done' would wrap to a huge
		 * size (with an out-of-bounds buffer pointer) on the
		 * next call. Stop once the buffer is full.
		 */
		if (done >= len)
			break;
		done += snprintf(&buf[done], len - done, "%d ", i);
	}
}
219+
220+
/*
 * Reduce @set to the first CPU it contains.
 *
 * In future, auto affinity selection can be tried.
 */
static void ublk_adjust_affinity(cpu_set_t *set)
{
	int cpu, first_seen = 0;

	for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
		if (!CPU_ISSET(cpu, set))
			continue;
		/* keep only the first set CPU, clear the rest */
		if (first_seen)
			CPU_CLR(cpu, set);
		else
			first_seen = 1;
	}
}
239+
240+
/* Caller must free the allocated buffer */
241+
static int ublk_ctrl_get_affinity(struct ublk_dev *ctrl_dev, cpu_set_t **ptr_buf)
242+
{
243+
struct ublk_ctrl_cmd_data data = {
244+
.cmd_op = UBLK_U_CMD_GET_QUEUE_AFFINITY,
245+
.flags = CTRL_CMD_HAS_DATA | CTRL_CMD_HAS_BUF,
246+
};
247+
cpu_set_t *buf;
248+
int i, ret;
249+
250+
buf = malloc(sizeof(cpu_set_t) * ctrl_dev->dev_info.nr_hw_queues);
251+
if (!buf)
252+
return -ENOMEM;
253+
254+
for (i = 0; i < ctrl_dev->dev_info.nr_hw_queues; i++) {
255+
data.data[0] = i;
256+
data.len = sizeof(cpu_set_t);
257+
data.addr = (__u64)&buf[i];
258+
259+
ret = __ublk_ctrl_cmd(ctrl_dev, &data);
260+
if (ret < 0) {
261+
free(buf);
262+
return ret;
263+
}
264+
ublk_adjust_affinity(&buf[i]);
265+
}
266+
267+
*ptr_buf = buf;
268+
return 0;
269+
}
270+
209271
static void ublk_ctrl_dump(struct ublk_dev *dev)
210272
{
211273
struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
212274
struct ublk_params p;
275+
cpu_set_t *affinity;
213276
int ret;
214277

215278
ret = ublk_ctrl_get_params(dev, &p);
@@ -218,12 +281,31 @@ static void ublk_ctrl_dump(struct ublk_dev *dev)
218281
return;
219282
}
220283

284+
ret = ublk_ctrl_get_affinity(dev, &affinity);
285+
if (ret < 0) {
286+
ublk_err("failed to get affinity %m\n");
287+
return;
288+
}
289+
221290
ublk_log("dev id %d: nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n",
222291
info->dev_id, info->nr_hw_queues, info->queue_depth,
223292
1 << p.basic.logical_bs_shift, p.basic.dev_sectors);
224293
ublk_log("\tmax rq size %d daemon pid %d flags 0x%llx state %s\n",
225294
info->max_io_buf_bytes, info->ublksrv_pid, info->flags,
226295
ublk_dev_state_desc(dev));
296+
297+
if (affinity) {
298+
char buf[512];
299+
int i;
300+
301+
for (i = 0; i < info->nr_hw_queues; i++) {
302+
ublk_print_cpu_set(&affinity[i], buf, sizeof(buf));
303+
printf("\tqueue %u: tid %d affinity(%s)\n",
304+
i, dev->q[i].tid, buf);
305+
}
306+
free(affinity);
307+
}
308+
227309
fflush(stdout);
228310
}
229311

@@ -603,9 +685,24 @@ static int ublk_process_io(struct ublk_queue *q)
603685
return reapped;
604686
}
605687

688+
static void ublk_queue_set_sched_affinity(const struct ublk_queue *q,
689+
cpu_set_t *cpuset)
690+
{
691+
if (sched_setaffinity(0, sizeof(*cpuset), cpuset) < 0)
692+
ublk_err("ublk dev %u queue %u set affinity failed",
693+
q->dev->dev_info.dev_id, q->q_id);
694+
}
695+
696+
/*
 * Per-queue startup arguments handed to ublk_io_handler_fn() through
 * pthread_create().
 */
struct ublk_queue_info {
	struct ublk_queue *q;	/* the queue this pthread serves */
	sem_t *queue_sem;	/* posted by the thread after it sets its affinity */
	cpu_set_t *affinity;	/* CPU mask the queue thread pins itself to */
};
701+
606702
static void *ublk_io_handler_fn(void *data)
607703
{
608-
struct ublk_queue *q = data;
704+
struct ublk_queue_info *info = data;
705+
struct ublk_queue *q = info->q;
609706
int dev_id = q->dev->dev_info.dev_id;
610707
int ret;
611708

@@ -615,6 +712,10 @@ static void *ublk_io_handler_fn(void *data)
615712
dev_id, q->q_id);
616713
return NULL;
617714
}
715+
/* IO perf is sensitive with queue pthread affinity on NUMA machine*/
716+
ublk_queue_set_sched_affinity(q, info->affinity);
717+
sem_post(info->queue_sem);
718+
618719
ublk_dbg(UBLK_DBG_QUEUE, "tid %d: ublk dev %d queue %d started\n",
619720
q->tid, dev_id, q->q_id);
620721

@@ -640,7 +741,7 @@ static void ublk_set_parameters(struct ublk_dev *dev)
640741
dev->dev_info.dev_id, ret);
641742
}
642743

643-
static int ublk_send_dev_event(const struct dev_ctx *ctx, int dev_id)
744+
static int ublk_send_dev_event(const struct dev_ctx *ctx, struct ublk_dev *dev, int dev_id)
644745
{
645746
uint64_t id;
646747
int evtfd = ctx->_evtfd;
@@ -653,35 +754,61 @@ static int ublk_send_dev_event(const struct dev_ctx *ctx, int dev_id)
653754
else
654755
id = ERROR_EVTFD_DEVID;
655756

757+
if (dev && ctx->shadow_dev)
758+
memcpy(&ctx->shadow_dev->q, &dev->q, sizeof(dev->q));
759+
656760
if (write(evtfd, &id, sizeof(id)) != sizeof(id))
657761
return -EINVAL;
658762

659763
close(evtfd);
764+
shmdt(ctx->shadow_dev);
660765

661766
return 0;
662767
}
663768

664769

665770
static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
666771
{
667-
int ret, i;
668-
void *thread_ret;
669772
const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info;
773+
struct ublk_queue_info *qinfo;
774+
cpu_set_t *affinity_buf;
775+
void *thread_ret;
776+
sem_t queue_sem;
777+
int ret, i;
670778

671779
ublk_dbg(UBLK_DBG_DEV, "%s enter\n", __func__);
672780

781+
qinfo = (struct ublk_queue_info *)calloc(sizeof(struct ublk_queue_info),
782+
dinfo->nr_hw_queues);
783+
if (!qinfo)
784+
return -ENOMEM;
785+
786+
sem_init(&queue_sem, 0, 0);
673787
ret = ublk_dev_prep(ctx, dev);
674788
if (ret)
675789
return ret;
676790

791+
ret = ublk_ctrl_get_affinity(dev, &affinity_buf);
792+
if (ret)
793+
return ret;
794+
677795
for (i = 0; i < dinfo->nr_hw_queues; i++) {
678796
dev->q[i].dev = dev;
679797
dev->q[i].q_id = i;
798+
799+
qinfo[i].q = &dev->q[i];
800+
qinfo[i].queue_sem = &queue_sem;
801+
qinfo[i].affinity = &affinity_buf[i];
680802
pthread_create(&dev->q[i].thread, NULL,
681803
ublk_io_handler_fn,
682-
&dev->q[i]);
804+
&qinfo[i]);
683805
}
684806

807+
for (i = 0; i < dinfo->nr_hw_queues; i++)
808+
sem_wait(&queue_sem);
809+
free(qinfo);
810+
free(affinity_buf);
811+
685812
/* everything is fine now, start us */
686813
ublk_set_parameters(dev);
687814
ret = ublk_ctrl_start_dev(dev, getpid());
@@ -694,7 +821,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
694821
if (ctx->fg)
695822
ublk_ctrl_dump(dev);
696823
else
697-
ublk_send_dev_event(ctx, dev->dev_info.dev_id);
824+
ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id);
698825

699826
/* wait until we are terminated */
700827
for (i = 0; i < dinfo->nr_hw_queues; i++)
@@ -873,7 +1000,7 @@ static int __cmd_dev_add(const struct dev_ctx *ctx)
8731000

8741001
fail:
8751002
if (ret < 0)
876-
ublk_send_dev_event(ctx, -1);
1003+
ublk_send_dev_event(ctx, dev, -1);
8771004
ublk_ctrl_deinit(dev);
8781005
return ret;
8791006
}
@@ -887,6 +1014,16 @@ static int cmd_dev_add(struct dev_ctx *ctx)
8871014
if (ctx->fg)
8881015
goto run;
8891016

1017+
ctx->_shmid = shmget(IPC_PRIVATE, sizeof(struct ublk_dev), IPC_CREAT | 0666);
1018+
if (ctx->_shmid < 0) {
1019+
ublk_err("%s: failed to shmget %s\n", __func__, strerror(errno));
1020+
exit(-1);
1021+
}
1022+
ctx->shadow_dev = (struct ublk_dev *)shmat(ctx->_shmid, NULL, 0);
1023+
if (ctx->shadow_dev == (struct ublk_dev *)-1) {
1024+
ublk_err("%s: failed to shmat %s\n", __func__, strerror(errno));
1025+
exit(-1);
1026+
}
8901027
ctx->_evtfd = eventfd(0, 0);
8911028
if (ctx->_evtfd < 0) {
8921029
ublk_err("%s: failed to create eventfd %s\n", __func__, strerror(errno));
@@ -922,6 +1059,8 @@ static int cmd_dev_add(struct dev_ctx *ctx)
9221059
if (__cmd_dev_list(ctx) >= 0)
9231060
exit_code = EXIT_SUCCESS;
9241061
}
1062+
shmdt(ctx->shadow_dev);
1063+
shmctl(ctx->_shmid, IPC_RMID, NULL);
9251064
/* wait for child and detach from it */
9261065
wait(NULL);
9271066
exit(exit_code);
@@ -988,6 +1127,9 @@ static int __cmd_dev_list(struct dev_ctx *ctx)
9881127
ublk_err("%s: can't get dev info from %d: %d\n",
9891128
__func__, ctx->dev_id, ret);
9901129
} else {
1130+
if (ctx->shadow_dev)
1131+
memcpy(&dev->q, ctx->shadow_dev->q, sizeof(dev->q));
1132+
9911133
ublk_ctrl_dump(dev);
9921134
}
9931135

tools/testing/selftests/ublk/kublk.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,15 @@
2020
#include <sys/wait.h>
2121
#include <sys/eventfd.h>
2222
#include <sys/uio.h>
23+
#include <sys/ipc.h>
24+
#include <sys/shm.h>
2325
#include <linux/io_uring.h>
2426
#include <liburing.h>
25-
#include <linux/ublk_cmd.h>
27+
#include <semaphore.h>
28+
29+
/* allow ublk_dep.h to override ublk_cmd.h */
2630
#include "ublk_dep.h"
31+
#include <linux/ublk_cmd.h>
2732

2833
#define __maybe_unused __attribute__((unused))
2934
#define MAX_BACK_FILES 4
@@ -74,6 +79,10 @@ struct dev_ctx {
7479
unsigned int chunk_size;
7580

7681
int _evtfd;
82+
int _shmid;
83+
84+
/* built from shmem, only for ublk_dump_dev() */
85+
struct ublk_dev *shadow_dev;
7786
};
7887

7988
struct ublk_ctrl_cmd_data {

0 commit comments

Comments
 (0)