Skip to content

Commit e540341

Browse files
committed
Merge tag 'block-6.16-20250626' of git://git.kernel.dk/linux
Pull block fixes from Jens Axboe:

 - Fixes for ublk:
     - fix C++ narrowing warnings in the uapi header
     - update/improve UBLK_F_SUPPORT_ZERO_COPY comment in uapi header
     - fix for the ublk ->queue_rqs() implementation, limiting a batch
       to just the specific task AND ring
     - ublk_get_data() error handling fix
     - sanity check more arguments in ublk_ctrl_add_dev()
     - selftest addition

 - NVMe pull request via Christoph:
     - reset delayed remove_work after reconnect
     - fix atomic write size validation

 - Fix for a warning introduced in bdev_count_inflight_rw() in this
   merge window

* tag 'block-6.16-20250626' of git://git.kernel.dk/linux:
  block: fix false warning in bdev_count_inflight_rw()
  ublk: sanity check add_dev input for underflow
  nvme: fix atomic write size validation
  nvme: refactor the atomic write unit detection
  nvme: reset delayed remove_work after reconnect
  ublk: setup ublk_io correctly in case of ublk_get_data() failure
  ublk: update UBLK_F_SUPPORT_ZERO_COPY comment in UAPI header
  ublk: fix narrowing warnings in UAPI header
  selftests: ublk: don't take same backing file for more than one ublk devices
  ublk: build batch from IOs in same io_ring_ctx and io task
2 parents 0a47e02 + c007062 commit e540341

File tree

7 files changed

+125
-79
lines changed

7 files changed

+125
-79
lines changed

block/genhd.c

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -128,23 +128,27 @@ static void part_stat_read_all(struct block_device *part,
128128
static void bdev_count_inflight_rw(struct block_device *part,
129129
unsigned int inflight[2], bool mq_driver)
130130
{
131+
int write = 0;
132+
int read = 0;
131133
int cpu;
132134

133135
if (mq_driver) {
134136
blk_mq_in_driver_rw(part, inflight);
135-
} else {
136-
for_each_possible_cpu(cpu) {
137-
inflight[READ] += part_stat_local_read_cpu(
138-
part, in_flight[READ], cpu);
139-
inflight[WRITE] += part_stat_local_read_cpu(
140-
part, in_flight[WRITE], cpu);
141-
}
137+
return;
138+
}
139+
140+
for_each_possible_cpu(cpu) {
141+
read += part_stat_local_read_cpu(part, in_flight[READ], cpu);
142+
write += part_stat_local_read_cpu(part, in_flight[WRITE], cpu);
142143
}
143144

144-
if (WARN_ON_ONCE((int)inflight[READ] < 0))
145-
inflight[READ] = 0;
146-
if (WARN_ON_ONCE((int)inflight[WRITE] < 0))
147-
inflight[WRITE] = 0;
145+
/*
146+
* While iterating all CPUs, some IOs may be issued from a CPU already
147+
* traversed and complete on a CPU that has not yet been traversed,
148+
* causing the inflight number to be negative.
149+
*/
150+
inflight[READ] = read > 0 ? read : 0;
151+
inflight[WRITE] = write > 0 ? write : 0;
148152
}
149153

150154
/**

drivers/block/ublk_drv.c

Lines changed: 37 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1148,8 +1148,8 @@ static inline void __ublk_complete_rq(struct request *req)
11481148
blk_mq_end_request(req, res);
11491149
}
11501150

1151-
static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
1152-
int res, unsigned issue_flags)
1151+
static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io,
1152+
struct request *req)
11531153
{
11541154
/* read cmd first because req will overwrite it */
11551155
struct io_uring_cmd *cmd = io->cmd;
@@ -1164,6 +1164,13 @@ static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
11641164
io->flags &= ~UBLK_IO_FLAG_ACTIVE;
11651165

11661166
io->req = req;
1167+
return cmd;
1168+
}
1169+
1170+
static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
1171+
int res, unsigned issue_flags)
1172+
{
1173+
struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req);
11671174

11681175
/* tell ublksrv one io request is coming */
11691176
io_uring_cmd_done(cmd, res, 0, issue_flags);
@@ -1416,6 +1423,14 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
14161423
return BLK_STS_OK;
14171424
}
14181425

1426+
static inline bool ublk_belong_to_same_batch(const struct ublk_io *io,
1427+
const struct ublk_io *io2)
1428+
{
1429+
return (io_uring_cmd_ctx_handle(io->cmd) ==
1430+
io_uring_cmd_ctx_handle(io2->cmd)) &&
1431+
(io->task == io2->task);
1432+
}
1433+
14191434
static void ublk_queue_rqs(struct rq_list *rqlist)
14201435
{
14211436
struct rq_list requeue_list = { };
@@ -1427,7 +1442,8 @@ static void ublk_queue_rqs(struct rq_list *rqlist)
14271442
struct ublk_queue *this_q = req->mq_hctx->driver_data;
14281443
struct ublk_io *this_io = &this_q->ios[req->tag];
14291444

1430-
if (io && io->task != this_io->task && !rq_list_empty(&submit_list))
1445+
if (io && !ublk_belong_to_same_batch(io, this_io) &&
1446+
!rq_list_empty(&submit_list))
14311447
ublk_queue_cmd_list(io, &submit_list);
14321448
io = this_io;
14331449

@@ -2148,10 +2164,9 @@ static int ublk_commit_and_fetch(const struct ublk_queue *ubq,
21482164
return 0;
21492165
}
21502166

2151-
static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io)
2167+
static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
2168+
struct request *req)
21522169
{
2153-
struct request *req = io->req;
2154-
21552170
/*
21562171
* We have handled UBLK_IO_NEED_GET_DATA command,
21572172
* so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
@@ -2178,6 +2193,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
21782193
u32 cmd_op = cmd->cmd_op;
21792194
unsigned tag = ub_cmd->tag;
21802195
int ret = -EINVAL;
2196+
struct request *req;
21812197

21822198
pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
21832199
__func__, cmd->cmd_op, ub_cmd->q_id, tag,
@@ -2236,11 +2252,19 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
22362252
goto out;
22372253
break;
22382254
case UBLK_IO_NEED_GET_DATA:
2239-
io->addr = ub_cmd->addr;
2240-
if (!ublk_get_data(ubq, io))
2241-
return -EIOCBQUEUED;
2242-
2243-
return UBLK_IO_RES_OK;
2255+
/*
2256+
* ublk_get_data() may fail and fallback to requeue, so keep
2257+
* uring_cmd active first and prepare for handling new requeued
2258+
* request
2259+
*/
2260+
req = io->req;
2261+
ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
2262+
io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
2263+
if (likely(ublk_get_data(ubq, io, req))) {
2264+
__ublk_prep_compl_io_cmd(io, req);
2265+
return UBLK_IO_RES_OK;
2266+
}
2267+
break;
22442268
default:
22452269
goto out;
22462270
}
@@ -2825,7 +2849,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
28252849
if (copy_from_user(&info, argp, sizeof(info)))
28262850
return -EFAULT;
28272851

2828-
if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || info.nr_hw_queues > UBLK_MAX_NR_QUEUES)
2852+
if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth ||
2853+
info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues)
28292854
return -EINVAL;
28302855

28312856
if (capable(CAP_SYS_ADMIN))

drivers/nvme/host/core.c

Lines changed: 42 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -2015,21 +2015,41 @@ static void nvme_configure_metadata(struct nvme_ctrl *ctrl,
20152015
}
20162016

20172017

2018-
static void nvme_update_atomic_write_disk_info(struct nvme_ns *ns,
2019-
struct nvme_id_ns *id, struct queue_limits *lim,
2020-
u32 bs, u32 atomic_bs)
2018+
static u32 nvme_configure_atomic_write(struct nvme_ns *ns,
2019+
struct nvme_id_ns *id, struct queue_limits *lim, u32 bs)
20212020
{
2022-
unsigned int boundary = 0;
2021+
u32 atomic_bs, boundary = 0;
20232022

2024-
if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) {
2025-
if (le16_to_cpu(id->nabspf))
2023+
/*
2024+
* We do not support an offset for the atomic boundaries.
2025+
*/
2026+
if (id->nabo)
2027+
return bs;
2028+
2029+
if ((id->nsfeat & NVME_NS_FEAT_ATOMICS) && id->nawupf) {
2030+
/*
2031+
* Use the per-namespace atomic write unit when available.
2032+
*/
2033+
atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
2034+
if (id->nabspf)
20262035
boundary = (le16_to_cpu(id->nabspf) + 1) * bs;
2036+
} else {
2037+
/*
2038+
* Use the controller wide atomic write unit. This sucks
2039+
* because the limit is defined in terms of logical blocks while
2040+
* namespaces can have different formats, and because there is
2041+
* no clear language in the specification prohibiting different
2042+
* values for different controllers in the subsystem.
2043+
*/
2044+
atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
20272045
}
2046+
20282047
lim->atomic_write_hw_max = atomic_bs;
20292048
lim->atomic_write_hw_boundary = boundary;
20302049
lim->atomic_write_hw_unit_min = bs;
20312050
lim->atomic_write_hw_unit_max = rounddown_pow_of_two(atomic_bs);
20322051
lim->features |= BLK_FEAT_ATOMIC_WRITES;
2052+
return atomic_bs;
20332053
}
20342054

20352055
static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl)
@@ -2067,34 +2087,8 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
20672087
valid = false;
20682088
}
20692089

2070-
atomic_bs = phys_bs = bs;
2071-
if (id->nabo == 0) {
2072-
/*
2073-
* Bit 1 indicates whether NAWUPF is defined for this namespace
2074-
* and whether it should be used instead of AWUPF. If NAWUPF ==
2075-
* 0 then AWUPF must be used instead.
2076-
*/
2077-
if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
2078-
atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
2079-
else
2080-
atomic_bs = (1 + ns->ctrl->awupf) * bs;
2081-
2082-
/*
2083-
* Set subsystem atomic bs.
2084-
*/
2085-
if (ns->ctrl->subsys->atomic_bs) {
2086-
if (atomic_bs != ns->ctrl->subsys->atomic_bs) {
2087-
dev_err_ratelimited(ns->ctrl->device,
2088-
"%s: Inconsistent Atomic Write Size, Namespace will not be added: Subsystem=%d bytes, Controller/Namespace=%d bytes\n",
2089-
ns->disk ? ns->disk->disk_name : "?",
2090-
ns->ctrl->subsys->atomic_bs,
2091-
atomic_bs);
2092-
}
2093-
} else
2094-
ns->ctrl->subsys->atomic_bs = atomic_bs;
2095-
2096-
nvme_update_atomic_write_disk_info(ns, id, lim, bs, atomic_bs);
2097-
}
2090+
phys_bs = bs;
2091+
atomic_bs = nvme_configure_atomic_write(ns, id, lim, bs);
20982092

20992093
if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
21002094
/* NPWG = Namespace Preferred Write Granularity */
@@ -2382,16 +2376,6 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
23822376
if (!nvme_update_disk_info(ns, id, &lim))
23832377
capacity = 0;
23842378

2385-
/*
2386-
* Validate the max atomic write size fits within the subsystem's
2387-
* atomic write capabilities.
2388-
*/
2389-
if (lim.atomic_write_hw_max > ns->ctrl->subsys->atomic_bs) {
2390-
blk_mq_unfreeze_queue(ns->disk->queue, memflags);
2391-
ret = -ENXIO;
2392-
goto out;
2393-
}
2394-
23952379
nvme_config_discard(ns, &lim);
23962380
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
23972381
ns->head->ids.csi == NVME_CSI_ZNS)
@@ -3215,6 +3199,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
32153199
memcpy(subsys->model, id->mn, sizeof(subsys->model));
32163200
subsys->vendor_id = le16_to_cpu(id->vid);
32173201
subsys->cmic = id->cmic;
3202+
subsys->awupf = le16_to_cpu(id->awupf);
32183203

32193204
/* Versions prior to 1.4 don't necessarily report a valid type */
32203205
if (id->cntrltype == NVME_CTRL_DISC ||
@@ -3552,6 +3537,15 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
35523537
if (ret)
35533538
goto out_free;
35543539
}
3540+
3541+
if (le16_to_cpu(id->awupf) != ctrl->subsys->awupf) {
3542+
dev_err_ratelimited(ctrl->device,
3543+
"inconsistent AWUPF, controller not added (%u/%u).\n",
3544+
le16_to_cpu(id->awupf), ctrl->subsys->awupf);
3545+
ret = -EINVAL;
3546+
goto out_free;
3547+
}
3548+
35553549
memcpy(ctrl->subsys->firmware_rev, id->fr,
35563550
sizeof(ctrl->subsys->firmware_rev));
35573551

@@ -3647,7 +3641,6 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
36473641
dev_pm_qos_expose_latency_tolerance(ctrl->device);
36483642
else if (!ctrl->apst_enabled && prev_apst_enabled)
36493643
dev_pm_qos_hide_latency_tolerance(ctrl->device);
3650-
ctrl->awupf = le16_to_cpu(id->awupf);
36513644
out_free:
36523645
kfree(id);
36533646
return ret;
@@ -4036,6 +4029,10 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
40364029
list_add_tail_rcu(&ns->siblings, &head->list);
40374030
ns->head = head;
40384031
mutex_unlock(&ctrl->subsys->lock);
4032+
4033+
#ifdef CONFIG_NVME_MULTIPATH
4034+
cancel_delayed_work(&head->remove_work);
4035+
#endif
40394036
return 0;
40404037

40414038
out_put_ns_head:

drivers/nvme/host/multipath.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1311,7 +1311,7 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head)
13111311
*/
13121312
if (!try_module_get(THIS_MODULE))
13131313
goto out;
1314-
queue_delayed_work(nvme_wq, &head->remove_work,
1314+
mod_delayed_work(nvme_wq, &head->remove_work,
13151315
head->delayed_removal_secs * HZ);
13161316
} else {
13171317
list_del_init(&head->entry);

drivers/nvme/host/nvme.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -410,7 +410,6 @@ struct nvme_ctrl {
410410

411411
enum nvme_ctrl_type cntrltype;
412412
enum nvme_dctype dctype;
413-
u16 awupf; /* 0's based value. */
414413
};
415414

416415
static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
@@ -443,11 +442,11 @@ struct nvme_subsystem {
443442
u8 cmic;
444443
enum nvme_subsys_type subtype;
445444
u16 vendor_id;
445+
u16 awupf; /* 0's based value. */
446446
struct ida ns_ida;
447447
#ifdef CONFIG_NVME_MULTIPATH
448448
enum nvme_iopolicy iopolicy;
449449
#endif
450-
u32 atomic_bs;
451450
};
452451

453452
/*

include/uapi/linux/ublk_cmd.h

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -135,8 +135,28 @@
135135
#define UBLKSRV_IO_BUF_TOTAL_SIZE (1ULL << UBLKSRV_IO_BUF_TOTAL_BITS)
136136

137137
/*
138-
* zero copy requires 4k block size, and can remap ublk driver's io
139-
* request into ublksrv's vm space
138+
* ublk server can register data buffers for incoming I/O requests with a sparse
139+
* io_uring buffer table. The request buffer can then be used as the data buffer
140+
* for io_uring operations via the fixed buffer index.
141+
* Note that the ublk server can never directly access the request data memory.
142+
*
143+
* To use this feature, the ublk server must first register a sparse buffer
144+
* table on an io_uring instance.
145+
* When an incoming ublk request is received, the ublk server submits a
146+
* UBLK_U_IO_REGISTER_IO_BUF command to that io_uring instance. The
147+
* ublksrv_io_cmd's q_id and tag specify the request whose buffer to register
148+
* and addr is the index in the io_uring's buffer table to install the buffer.
149+
* SQEs can now be submitted to the io_uring to read/write the request's buffer
150+
* by enabling fixed buffers (e.g. using IORING_OP_{READ,WRITE}_FIXED or
151+
* IORING_URING_CMD_FIXED) and passing the registered buffer index in buf_index.
152+
* Once the last io_uring operation using the request's buffer has completed,
153+
* the ublk server submits a UBLK_U_IO_UNREGISTER_IO_BUF command with q_id, tag,
154+
* and addr again specifying the request buffer to unregister.
155+
* The ublk request is completed when its buffer is unregistered from all
156+
* io_uring instances and the ublk server issues UBLK_U_IO_COMMIT_AND_FETCH_REQ.
157+
*
158+
* Not available for UBLK_F_UNPRIVILEGED_DEV, as a ublk server can leak
159+
* uninitialized kernel memory by not reading into the full request buffer.
140160
*/
141161
#define UBLK_F_SUPPORT_ZERO_COPY (1ULL << 0)
142162

@@ -450,10 +470,10 @@ static inline struct ublk_auto_buf_reg ublk_sqe_addr_to_auto_buf_reg(
450470
__u64 sqe_addr)
451471
{
452472
struct ublk_auto_buf_reg reg = {
453-
.index = sqe_addr & 0xffff,
454-
.flags = (sqe_addr >> 16) & 0xff,
455-
.reserved0 = (sqe_addr >> 24) & 0xff,
456-
.reserved1 = sqe_addr >> 32,
473+
.index = (__u16)sqe_addr,
474+
.flags = (__u8)(sqe_addr >> 16),
475+
.reserved0 = (__u8)(sqe_addr >> 24),
476+
.reserved1 = (__u32)(sqe_addr >> 32),
457477
};
458478

459479
return reg;

tools/testing/selftests/ublk/test_stress_03.sh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,22 +32,23 @@ _create_backfile 2 128M
3232
ublk_io_and_remove 8G -t null -q 4 -z &
3333
ublk_io_and_remove 256M -t loop -q 4 -z "${UBLK_BACKFILES[0]}" &
3434
ublk_io_and_remove 256M -t stripe -q 4 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
35+
wait
3536

3637
if _have_feature "AUTO_BUF_REG"; then
3738
ublk_io_and_remove 8G -t null -q 4 --auto_zc &
3839
ublk_io_and_remove 256M -t loop -q 4 --auto_zc "${UBLK_BACKFILES[0]}" &
3940
ublk_io_and_remove 256M -t stripe -q 4 --auto_zc "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
4041
ublk_io_and_remove 8G -t null -q 4 -z --auto_zc --auto_zc_fallback &
42+
wait
4143
fi
42-
wait
4344

4445
if _have_feature "PER_IO_DAEMON"; then
4546
ublk_io_and_remove 8G -t null -q 4 --auto_zc --nthreads 8 --per_io_tasks &
4647
ublk_io_and_remove 256M -t loop -q 4 --auto_zc --nthreads 8 --per_io_tasks "${UBLK_BACKFILES[0]}" &
4748
ublk_io_and_remove 256M -t stripe -q 4 --auto_zc --nthreads 8 --per_io_tasks "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
4849
ublk_io_and_remove 8G -t null -q 4 -z --auto_zc --auto_zc_fallback --nthreads 8 --per_io_tasks &
50+
wait
4951
fi
50-
wait
5152

5253
_cleanup_test "stress"
5354
_show_result $TID $ERR_CODE

0 commit comments

Comments
 (0)