Skip to content

Commit f227345

Browse files
Thomas Song authored and Keith Busch committed
nvme-multipath: implement "queue-depth" iopolicy
The round-robin path selector is inefficient in cases where there is a difference in latency between paths. In the presence of one or more high latency paths the round-robin selector continues to use the high latency path equally. This results in a bias towards the highest latency path and can cause a significant decrease in overall performance as IOs pile on the highest latency path. This problem is acute with NVMe-oF controllers. The queue-depth path selector sends I/O down the path with the lowest number of requests in its request queue. Paths with lower latency will clear requests more quickly and have less requests queued compared to higher latency paths. The goal of this path selector is to make more use of lower latency paths which will bring down overall IO latency and increase throughput and performance. Signed-off-by: Thomas Song <[email protected]> [emilne: commandeered patch developed by Thomas Song @ Pure Storage] Co-developed-by: Ewan D. Milne <[email protected]> Signed-off-by: Ewan D. Milne <[email protected]> Co-developed-by: John Meneghini <[email protected]> Signed-off-by: John Meneghini <[email protected]> Link: https://lore.kernel.org/linux-nvme/[email protected]/ Tested-by: Marco Patalano <[email protected]> Tested-by: Jyoti Rani <[email protected]> Tested-by: John Meneghini <[email protected]> Reviewed-by: Randy Jennings <[email protected]> Reviewed-by: Hannes Reinecke <[email protected]> Reviewed-by: Sagi Grimberg <[email protected]> Reviewed-by: Chaitanya Kulkarni <[email protected]> Reviewed-by: Christoph Hellwig <[email protected]> Signed-off-by: Keith Busch <[email protected]>
1 parent 3d7c2fd commit f227345

File tree

3 files changed

+87
-5
lines changed

3 files changed

+87
-5
lines changed

drivers/nvme/host/core.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ struct workqueue_struct *nvme_delete_wq;
110110
EXPORT_SYMBOL_GPL(nvme_delete_wq);
111111

112112
static LIST_HEAD(nvme_subsystems);
113-
static DEFINE_MUTEX(nvme_subsystems_lock);
113+
DEFINE_MUTEX(nvme_subsystems_lock);
114114

115115
static DEFINE_IDA(nvme_instance_ida);
116116
static dev_t nvme_ctrl_base_chr_devt;

drivers/nvme/host/multipath.c

Lines changed: 82 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ MODULE_PARM_DESC(multipath,
1717
static const char *nvme_iopolicy_names[] = {
1818
[NVME_IOPOLICY_NUMA] = "numa",
1919
[NVME_IOPOLICY_RR] = "round-robin",
20+
[NVME_IOPOLICY_QD] = "queue-depth",
2021
};
2122

2223
static int iopolicy = NVME_IOPOLICY_NUMA;
@@ -29,6 +30,8 @@ static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
2930
iopolicy = NVME_IOPOLICY_NUMA;
3031
else if (!strncmp(val, "round-robin", 11))
3132
iopolicy = NVME_IOPOLICY_RR;
33+
else if (!strncmp(val, "queue-depth", 11))
34+
iopolicy = NVME_IOPOLICY_QD;
3235
else
3336
return -EINVAL;
3437

@@ -43,7 +46,7 @@ static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
4346
module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
4447
&iopolicy, 0644);
4548
MODULE_PARM_DESC(iopolicy,
46-
"Default multipath I/O policy; 'numa' (default) or 'round-robin'");
49+
"Default multipath I/O policy; 'numa' (default), 'round-robin' or 'queue-depth'");
4750

4851
void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
4952
{
@@ -128,6 +131,11 @@ void nvme_mpath_start_request(struct request *rq)
128131
struct nvme_ns *ns = rq->q->queuedata;
129132
struct gendisk *disk = ns->head->disk;
130133

134+
if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD) {
135+
atomic_inc(&ns->ctrl->nr_active);
136+
nvme_req(rq)->flags |= NVME_MPATH_CNT_ACTIVE;
137+
}
138+
131139
if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
132140
return;
133141

@@ -141,6 +149,9 @@ void nvme_mpath_end_request(struct request *rq)
141149
{
142150
struct nvme_ns *ns = rq->q->queuedata;
143151

152+
if (nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE)
153+
atomic_dec_if_positive(&ns->ctrl->nr_active);
154+
144155
if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
145156
return;
146157
bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
@@ -339,6 +350,42 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head)
339350
return found;
340351
}
341352

353+
static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
354+
{
355+
struct nvme_ns *best_opt = NULL, *best_nonopt = NULL, *ns;
356+
unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
357+
unsigned int depth;
358+
359+
list_for_each_entry_rcu(ns, &head->list, siblings) {
360+
if (nvme_path_is_disabled(ns))
361+
continue;
362+
363+
depth = atomic_read(&ns->ctrl->nr_active);
364+
365+
switch (ns->ana_state) {
366+
case NVME_ANA_OPTIMIZED:
367+
if (depth < min_depth_opt) {
368+
min_depth_opt = depth;
369+
best_opt = ns;
370+
}
371+
break;
372+
case NVME_ANA_NONOPTIMIZED:
373+
if (depth < min_depth_nonopt) {
374+
min_depth_nonopt = depth;
375+
best_nonopt = ns;
376+
}
377+
break;
378+
default:
379+
break;
380+
}
381+
382+
if (min_depth_opt == 0)
383+
return best_opt;
384+
}
385+
386+
return best_opt ? best_opt : best_nonopt;
387+
}
388+
342389
static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
343390
{
344391
return nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE &&
@@ -360,9 +407,14 @@ static struct nvme_ns *nvme_numa_path(struct nvme_ns_head *head)
360407

361408
inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
362409
{
363-
if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
410+
switch (READ_ONCE(head->subsys->iopolicy)) {
411+
case NVME_IOPOLICY_QD:
412+
return nvme_queue_depth_path(head);
413+
case NVME_IOPOLICY_RR:
364414
return nvme_round_robin_path(head);
365-
return nvme_numa_path(head);
415+
default:
416+
return nvme_numa_path(head);
417+
}
366418
}
367419

368420
static bool nvme_available_path(struct nvme_ns_head *head)
@@ -794,6 +846,29 @@ static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
794846
nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
795847
}
796848

849+
static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys,
850+
int iopolicy)
851+
{
852+
struct nvme_ctrl *ctrl;
853+
int old_iopolicy = READ_ONCE(subsys->iopolicy);
854+
855+
if (old_iopolicy == iopolicy)
856+
return;
857+
858+
WRITE_ONCE(subsys->iopolicy, iopolicy);
859+
860+
/* iopolicy changes clear the mpath by design */
861+
mutex_lock(&nvme_subsystems_lock);
862+
list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
863+
nvme_mpath_clear_ctrl_paths(ctrl);
864+
mutex_unlock(&nvme_subsystems_lock);
865+
866+
pr_notice("subsysnqn %s iopolicy changed from %s to %s\n",
867+
subsys->subnqn,
868+
nvme_iopolicy_names[old_iopolicy],
869+
nvme_iopolicy_names[iopolicy]);
870+
}
871+
797872
static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
798873
struct device_attribute *attr, const char *buf, size_t count)
799874
{
@@ -803,7 +878,7 @@ static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
803878

804879
for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
805880
if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
806-
WRITE_ONCE(subsys->iopolicy, i);
881+
nvme_subsys_iopolicy_update(subsys, i);
807882
return count;
808883
}
809884
}
@@ -911,6 +986,9 @@ int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
911986
!(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
912987
return 0;
913988

989+
/* initialize this in the identify path to cover controller resets */
990+
atomic_set(&ctrl->nr_active, 0);
991+
914992
if (!ctrl->max_namespaces ||
915993
ctrl->max_namespaces > le32_to_cpu(id->nn)) {
916994
dev_err(ctrl->device,

drivers/nvme/host/nvme.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ extern unsigned int admin_timeout;
4949
extern struct workqueue_struct *nvme_wq;
5050
extern struct workqueue_struct *nvme_reset_wq;
5151
extern struct workqueue_struct *nvme_delete_wq;
52+
extern struct mutex nvme_subsystems_lock;
5253

5354
/*
5455
* List of workarounds for devices that required behavior not specified in
@@ -195,6 +196,7 @@ enum {
195196
NVME_REQ_CANCELLED = (1 << 0),
196197
NVME_REQ_USERCMD = (1 << 1),
197198
NVME_MPATH_IO_STATS = (1 << 2),
199+
NVME_MPATH_CNT_ACTIVE = (1 << 3),
198200
};
199201

200202
static inline struct nvme_request *nvme_req(struct request *req)
@@ -360,6 +362,7 @@ struct nvme_ctrl {
360362
size_t ana_log_size;
361363
struct timer_list anatt_timer;
362364
struct work_struct ana_work;
365+
atomic_t nr_active;
363366
#endif
364367

365368
#ifdef CONFIG_NVME_HOST_AUTH
@@ -408,6 +411,7 @@ static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
408411
enum nvme_iopolicy {
409412
NVME_IOPOLICY_NUMA,
410413
NVME_IOPOLICY_RR,
414+
NVME_IOPOLICY_QD,
411415
};
412416

413417
struct nvme_subsystem {

0 commit comments

Comments
 (0)