Skip to content

Commit b48c312

Browse files
isilence authored and axboe committed
io_uring/net: simplify zerocopy send user API
Following user feedback, this patch simplifies the zerocopy send API. One of the main complaints is that the current API is difficult, with the userspace managing notification slots, and send retries with error handling make it even worse. Instead of keeping notification slots, change it to the per-request notifications model, which posts both completion and notification CQEs for each request when any data has been sent, and only one CQE if it fails. All notification CQEs will have IORING_CQE_F_NOTIF set, and IORING_CQE_F_MORE in completion CQEs indicates whether to wait for a notification or not. IOSQE_CQE_SKIP_SUCCESS is disallowed with zerocopy sends for now. This is less flexible, but greatly simplifies the user API and also the kernel implementation. We reuse notif helpers in this patch, but in the future there won't be a need for keeping two requests. Signed-off-by: Pavel Begunkov <[email protected]> Link: https://lore.kernel.org/r/95287640ab98fc9417370afb16e310677c63e6ce.1662027856.git.asml.silence@gmail.com Signed-off-by: Jens Axboe <[email protected]>
1 parent 57f3322 commit b48c312

File tree

7 files changed

+47
-76
lines changed

7 files changed

+47
-76
lines changed

include/uapi/linux/io_uring.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,8 @@ struct io_uring_sqe {
7171
__s32 splice_fd_in;
7272
__u32 file_index;
7373
struct {
74-
__u16 notification_idx;
7574
__u16 addr_len;
75+
__u16 __pad3[1];
7676
};
7777
};
7878
union {
@@ -205,7 +205,7 @@ enum io_uring_op {
205205
IORING_OP_GETXATTR,
206206
IORING_OP_SOCKET,
207207
IORING_OP_URING_CMD,
208-
IORING_OP_SENDZC_NOTIF,
208+
IORING_OP_SEND_ZC,
209209

210210
/* this goes last, obviously */
211211
IORING_OP_LAST,
@@ -326,10 +326,13 @@ struct io_uring_cqe {
326326
* IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID
327327
* IORING_CQE_F_MORE If set, parent SQE will generate more CQE entries
328328
* IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv
329+
* IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to distinguish
330+
* them from sends.
329331
*/
330332
#define IORING_CQE_F_BUFFER (1U << 0)
331333
#define IORING_CQE_F_MORE (1U << 1)
332334
#define IORING_CQE_F_SOCK_NONEMPTY (1U << 2)
335+
#define IORING_CQE_F_NOTIF (1U << 3)
333336

334337
enum {
335338
IORING_CQE_BUFFER_SHIFT = 16,

io_uring/io_uring.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3923,8 +3923,8 @@ static int __init io_uring_init(void)
39233923
BUILD_BUG_SQE_ELEM(42, __u16, personality);
39243924
BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
39253925
BUILD_BUG_SQE_ELEM(44, __u32, file_index);
3926-
BUILD_BUG_SQE_ELEM(44, __u16, notification_idx);
3927-
BUILD_BUG_SQE_ELEM(46, __u16, addr_len);
3926+
BUILD_BUG_SQE_ELEM(44, __u16, addr_len);
3927+
BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]);
39283928
BUILD_BUG_SQE_ELEM(48, __u64, addr3);
39293929
BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
39303930
BUILD_BUG_SQE_ELEM(56, __u64, __pad2);

io_uring/net.c

Lines changed: 33 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -65,12 +65,12 @@ struct io_sendzc {
6565
struct file *file;
6666
void __user *buf;
6767
size_t len;
68-
u16 slot_idx;
6968
unsigned msg_flags;
7069
unsigned flags;
7170
unsigned addr_len;
7271
void __user *addr;
7372
size_t done_io;
73+
struct io_kiocb *notif;
7474
};
7575

7676
#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED)
@@ -879,12 +879,26 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
879879
return ret;
880880
}
881881

882+
void io_sendzc_cleanup(struct io_kiocb *req)
883+
{
884+
struct io_sendzc *zc = io_kiocb_to_cmd(req, struct io_sendzc);
885+
886+
zc->notif->flags |= REQ_F_CQE_SKIP;
887+
io_notif_flush(zc->notif);
888+
zc->notif = NULL;
889+
}
890+
882891
int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
883892
{
884893
struct io_sendzc *zc = io_kiocb_to_cmd(req, struct io_sendzc);
885894
struct io_ring_ctx *ctx = req->ctx;
895+
struct io_kiocb *notif;
886896

887-
if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))
897+
if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3) ||
898+
READ_ONCE(sqe->__pad3[0]))
899+
return -EINVAL;
900+
/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
901+
if (req->flags & REQ_F_CQE_SKIP)
888902
return -EINVAL;
889903

890904
zc->flags = READ_ONCE(sqe->ioprio);
@@ -900,11 +914,17 @@ int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
900914
req->imu = READ_ONCE(ctx->user_bufs[idx]);
901915
io_req_set_rsrc_node(req, ctx, 0);
902916
}
917+
notif = zc->notif = io_alloc_notif(ctx);
918+
if (!notif)
919+
return -ENOMEM;
920+
notif->cqe.user_data = req->cqe.user_data;
921+
notif->cqe.res = 0;
922+
notif->cqe.flags = IORING_CQE_F_NOTIF;
923+
req->flags |= REQ_F_NEED_CLEANUP;
903924

904925
zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
905926
zc->len = READ_ONCE(sqe->len);
906927
zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
907-
zc->slot_idx = READ_ONCE(sqe->notification_idx);
908928
if (zc->msg_flags & MSG_DONTWAIT)
909929
req->flags |= REQ_F_NOWAIT;
910930

@@ -976,33 +996,20 @@ static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb,
976996
int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
977997
{
978998
struct sockaddr_storage __address, *addr = NULL;
979-
struct io_ring_ctx *ctx = req->ctx;
980999
struct io_sendzc *zc = io_kiocb_to_cmd(req, struct io_sendzc);
981-
struct io_notif_slot *notif_slot;
982-
struct io_kiocb *notif;
9831000
struct msghdr msg;
9841001
struct iovec iov;
9851002
struct socket *sock;
986-
unsigned msg_flags;
1003+
unsigned msg_flags, cflags;
9871004
int ret, min_ret = 0;
9881005

9891006
if (!(req->flags & REQ_F_POLLED) &&
9901007
(zc->flags & IORING_RECVSEND_POLL_FIRST))
9911008
return -EAGAIN;
992-
993-
if (issue_flags & IO_URING_F_UNLOCKED)
994-
return -EAGAIN;
9951009
sock = sock_from_file(req->file);
9961010
if (unlikely(!sock))
9971011
return -ENOTSOCK;
9981012

999-
notif_slot = io_get_notif_slot(ctx, zc->slot_idx);
1000-
if (!notif_slot)
1001-
return -EINVAL;
1002-
notif = io_get_notif(ctx, notif_slot);
1003-
if (!notif)
1004-
return -ENOMEM;
1005-
10061013
msg.msg_name = NULL;
10071014
msg.msg_control = NULL;
10081015
msg.msg_controllen = 0;
@@ -1033,7 +1040,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
10331040
&msg.msg_iter);
10341041
if (unlikely(ret))
10351042
return ret;
1036-
ret = io_notif_account_mem(notif, zc->len);
1043+
ret = io_notif_account_mem(zc->notif, zc->len);
10371044
if (unlikely(ret))
10381045
return ret;
10391046
}
@@ -1045,7 +1052,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
10451052
min_ret = iov_iter_count(&msg.msg_iter);
10461053

10471054
msg.msg_flags = msg_flags;
1048-
msg.msg_ubuf = &io_notif_to_data(notif)->uarg;
1055+
msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
10491056
msg.sg_from_iter = io_sg_from_iter;
10501057
ret = sock_sendmsg(sock, &msg);
10511058

@@ -1060,6 +1067,8 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
10601067
req->flags |= REQ_F_PARTIAL_IO;
10611068
return io_setup_async_addr(req, addr, issue_flags);
10621069
}
1070+
if (ret < 0 && !zc->done_io)
1071+
zc->notif->flags |= REQ_F_CQE_SKIP;
10631072
if (ret == -ERESTARTSYS)
10641073
ret = -EINTR;
10651074
req_set_fail(req);
@@ -1069,7 +1078,11 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
10691078
ret += zc->done_io;
10701079
else if (zc->done_io)
10711080
ret = zc->done_io;
1072-
io_req_set_res(req, ret, 0);
1081+
1082+
io_notif_flush(zc->notif);
1083+
req->flags &= ~REQ_F_NEED_CLEANUP;
1084+
cflags = ret >= 0 ? IORING_CQE_F_MORE : 0;
1085+
io_req_set_res(req, ret, cflags);
10731086
return IOU_OK;
10741087
}
10751088

io_uring/net.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags);
5555

5656
int io_sendzc(struct io_kiocb *req, unsigned int issue_flags);
5757
int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
58+
void io_sendzc_cleanup(struct io_kiocb *req);
5859

5960
void io_netmsg_cache_free(struct io_cache_entry *entry);
6061
#else

io_uring/notif.c

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,7 @@ static void io_uring_tx_zerocopy_callback(struct sk_buff *skb,
4242
}
4343
}
4444

45-
struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx,
46-
struct io_notif_slot *slot)
45+
struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
4746
__must_hold(&ctx->uring_lock)
4847
{
4948
struct io_kiocb *notif;
@@ -59,27 +58,20 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx,
5958
io_get_task_refs(1);
6059
notif->rsrc_node = NULL;
6160
io_req_set_rsrc_node(notif, ctx, 0);
62-
notif->cqe.user_data = slot->tag;
63-
notif->cqe.flags = slot->seq++;
64-
notif->cqe.res = 0;
6561

6662
nd = io_notif_to_data(notif);
6763
nd->account_pages = 0;
6864
nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
6965
nd->uarg.callback = io_uring_tx_zerocopy_callback;
70-
/* master ref owned by io_notif_slot, will be dropped on flush */
7166
refcount_set(&nd->uarg.refcnt, 1);
7267
return notif;
7368
}
7469

75-
void io_notif_slot_flush(struct io_notif_slot *slot)
70+
void io_notif_flush(struct io_kiocb *notif)
7671
__must_hold(&slot->notif->ctx->uring_lock)
7772
{
78-
struct io_kiocb *notif = slot->notif;
7973
struct io_notif_data *nd = io_notif_to_data(notif);
8074

81-
slot->notif = NULL;
82-
8375
/* drop slot's master ref */
8476
if (refcount_dec_and_test(&nd->uarg.refcnt)) {
8577
notif->io_task_work.func = __io_notif_complete_tw;

io_uring/notif.h

Lines changed: 2 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -15,53 +15,14 @@ struct io_notif_data {
1515
unsigned long account_pages;
1616
};
1717

18-
struct io_notif_slot {
19-
/*
20-
* Current/active notifier. A slot holds only one active notifier at a
21-
* time and keeps one reference to it. Flush releases the reference and
22-
* lazily replaces it with a new notifier.
23-
*/
24-
struct io_kiocb *notif;
25-
26-
/*
27-
* Default ->user_data for this slot notifiers CQEs
28-
*/
29-
u64 tag;
30-
/*
31-
* Notifiers of a slot live in generations, we create a new notifier
32-
* only after flushing the previous one. Track the sequential number
33-
* for all notifiers and copy it into notifiers's cqe->cflags
34-
*/
35-
u32 seq;
36-
};
37-
38-
void io_notif_slot_flush(struct io_notif_slot *slot);
39-
struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx,
40-
struct io_notif_slot *slot);
18+
void io_notif_flush(struct io_kiocb *notif);
19+
struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx);
4120

4221
static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif)
4322
{
4423
return io_kiocb_to_cmd(notif, struct io_notif_data);
4524
}
4625

47-
static inline struct io_kiocb *io_get_notif(struct io_ring_ctx *ctx,
48-
struct io_notif_slot *slot)
49-
{
50-
if (!slot->notif)
51-
slot->notif = io_alloc_notif(ctx, slot);
52-
return slot->notif;
53-
}
54-
55-
static inline struct io_notif_slot *io_get_notif_slot(struct io_ring_ctx *ctx,
56-
unsigned idx)
57-
__must_hold(&ctx->uring_lock)
58-
{
59-
if (idx >= ctx->nr_notif_slots)
60-
return NULL;
61-
idx = array_index_nospec(idx, ctx->nr_notif_slots);
62-
return &ctx->notif_slots[idx];
63-
}
64-
6526
static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len)
6627
{
6728
struct io_ring_ctx *ctx = notif->ctx;

io_uring/opdef.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -470,7 +470,7 @@ const struct io_op_def io_op_defs[] = {
470470
.issue = io_uring_cmd,
471471
.prep_async = io_uring_cmd_prep_async,
472472
},
473-
[IORING_OP_SENDZC_NOTIF] = {
473+
[IORING_OP_SEND_ZC] = {
474474
.name = "SENDZC_NOTIF",
475475
.needs_file = 1,
476476
.unbound_nonreg_file = 1,
@@ -483,6 +483,7 @@ const struct io_op_def io_op_defs[] = {
483483
.prep = io_sendzc_prep,
484484
.issue = io_sendzc,
485485
.prep_async = io_sendzc_prep_async,
486+
.cleanup = io_sendzc_cleanup,
486487
#else
487488
.prep = io_eopnotsupp_prep,
488489
#endif

0 commit comments

Comments (0)