
Commit 34a3e60

isilence authored and axboe committed
io_uring/zcrx: implement zerocopy receive pp memory provider
Implement a page pool memory provider for io_uring to receive in a zero copy fashion. For that, the provider allocates user pages wrapped into struct net_iovs, which are stored in a previously registered struct net_iov_area.

Unlike traditional receive, which frees pages and returns them to the page pool right after data is copied to the user (e.g. inside recv(2)), we extend the buffer lifetime until user space confirms that it is done processing the data. That is done by taking a net_iov reference. When the user is done with a buffer, it must return it to the kernel by posting an entry into the refill ring, which is usually polled off the io_uring memory provider callback in the page pool's netmem allocation path.

There is also a separate set of per-net_iov "user" references accounting whether a buffer is currently given to the user (including possible fragmentation).

Reviewed-by: Jens Axboe <[email protected]>
Reviewed-by: Mina Almasry <[email protected]>
Signed-off-by: Pavel Begunkov <[email protected]>
Signed-off-by: David Wei <[email protected]>
Acked-by: Jakub Kicinski <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jens Axboe <[email protected]>
1 parent 035af94 commit 34a3e60
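To make the refill protocol described in the message concrete, here is a minimal userspace-side sketch of returning a buffer by posting an entry into the refill ring. The zcrx_* names, the trimmed rqe layout, and the shift constants are illustrative assumptions rather than uapi from this commit; what is grounded in the diff below is the off/__pad encoding and the release store that pairs with the kernel's smp_load_acquire() of the ring tail.

/* Hypothetical userspace sketch; names and constants are assumptions. */
#include <stdatomic.h>
#include <stdint.h>

#define ZCRX_AREA_SHIFT	48	/* assumed stand-in for IORING_ZCRX_AREA_SHIFT */
#define ZCRX_PAGE_SHIFT	12	/* assumed 4K pages */

struct zcrx_rqe {		/* trimmed stand-in for struct io_uring_zcrx_rqe */
	uint64_t off;		/* area index above AREA_SHIFT, byte offset below */
	uint32_t __pad;		/* must stay zero or the kernel skips the entry */
};

struct zcrx_refill {
	_Atomic uint32_t *ktail;	/* producer index, read by the kernel */
	uint32_t mask;			/* rq_entries - 1; size is a power of two */
	struct zcrx_rqe *rqes;
};

/* Return one buffer. area_idx must be 0 for now: only one area per ifq
 * is supported, and the kernel drops entries with a non-zero area index.
 * A complete producer would also watch the kernel's head for free space. */
static void zcrx_return_buf(struct zcrx_refill *r, uint64_t area_idx,
			    uint32_t niov_idx)
{
	uint32_t tail = atomic_load_explicit(r->ktail, memory_order_relaxed);
	struct zcrx_rqe *rqe = &r->rqes[tail & r->mask];

	rqe->off = (area_idx << ZCRX_AREA_SHIFT) |
		   ((uint64_t)niov_idx << ZCRX_PAGE_SHIFT);
	rqe->__pad = 0;
	/* publish the entry before bumping the tail the kernel acquires */
	atomic_store_explicit(r->ktail, tail + 1, memory_order_release);
}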

File tree

2 files changed: +277 -0 lines


io_uring/zcrx.c

Lines changed: 274 additions & 0 deletions
@@ -2,10 +2,17 @@
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/mm.h>
+#include <linux/nospec.h>
 #include <linux/io_uring.h>
 #include <linux/netdevice.h>
 #include <linux/rtnetlink.h>
 
+#include <net/page_pool/helpers.h>
+#include <net/page_pool/memory_provider.h>
+#include <net/netlink.h>
+
+#include <trace/events/page_pool.h>
+
 #include <uapi/linux/io_uring.h>
 
 #include "io_uring.h"
@@ -16,6 +23,33 @@
 
 #define IO_RQ_MAX_ENTRIES		32768
 
+__maybe_unused
+static const struct memory_provider_ops io_uring_pp_zc_ops;
+
+static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
+{
+	struct net_iov_area *owner = net_iov_owner(niov);
+
+	return container_of(owner, struct io_zcrx_area, nia);
+}
+
+static inline atomic_t *io_get_user_counter(struct net_iov *niov)
+{
+	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+
+	return &area->user_refs[net_iov_idx(niov)];
+}
+
+static bool io_zcrx_put_niov_uref(struct net_iov *niov)
+{
+	atomic_t *uref = io_get_user_counter(niov);
+
+	if (unlikely(!atomic_read(uref)))
+		return false;
+	atomic_dec(uref);
+	return true;
+}
+
 static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
 				 struct io_uring_zcrx_ifq_reg *reg,
 				 struct io_uring_region_desc *rd)
@@ -51,6 +85,7 @@ static void io_zcrx_free_area(struct io_zcrx_area *area)
 {
 	kvfree(area->freelist);
 	kvfree(area->nia.niovs);
+	kvfree(area->user_refs);
 	if (area->pages) {
 		unpin_user_pages(area->pages, area->nia.num_niovs);
 		kvfree(area->pages);
@@ -106,6 +141,19 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
 	for (i = 0; i < nr_pages; i++)
 		area->freelist[i] = i;
 
+	area->user_refs = kvmalloc_array(nr_pages, sizeof(area->user_refs[0]),
+					GFP_KERNEL | __GFP_ZERO);
+	if (!area->user_refs)
+		goto err;
+
+	for (i = 0; i < nr_pages; i++) {
+		struct net_iov *niov = &area->nia.niovs[i];
+
+		niov->owner = &area->nia;
+		area->freelist[i] = i;
+		atomic_set(&area->user_refs[i], 0);
+	}
+
 	area->free_count = nr_pages;
 	area->ifq = ifq;
 	/* we're only supporting one area per ifq for now */
@@ -131,6 +179,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
 	ifq->if_rxq = -1;
 	ifq->ctx = ctx;
 	spin_lock_init(&ifq->lock);
+	spin_lock_init(&ifq->rq_lock);
 	return ifq;
 }
 
@@ -256,7 +305,232 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
 	io_zcrx_ifq_free(ifq);
 }
 
+static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
+{
+	unsigned niov_idx;
+
+	lockdep_assert_held(&area->freelist_lock);
+
+	niov_idx = area->freelist[--area->free_count];
+	return &area->nia.niovs[niov_idx];
+}
+
+static void io_zcrx_return_niov_freelist(struct net_iov *niov)
+{
+	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+
+	spin_lock_bh(&area->freelist_lock);
+	area->freelist[area->free_count++] = net_iov_idx(niov);
+	spin_unlock_bh(&area->freelist_lock);
+}
+
+static void io_zcrx_return_niov(struct net_iov *niov)
+{
+	netmem_ref netmem = net_iov_to_netmem(niov);
+
+	page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false);
+}
+
+static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
+{
+	struct io_zcrx_area *area = ifq->area;
+	int i;
+
+	if (!area)
+		return;
+
+	/* Reclaim back all buffers given to the user space. */
+	for (i = 0; i < area->nia.num_niovs; i++) {
+		struct net_iov *niov = &area->nia.niovs[i];
+		int nr;
+
+		if (!atomic_read(io_get_user_counter(niov)))
+			continue;
+		nr = atomic_xchg(io_get_user_counter(niov), 0);
+		if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
+			io_zcrx_return_niov(niov);
+	}
+}
+
 void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
 {
 	lockdep_assert_held(&ctx->uring_lock);
+
+	if (ctx->ifq)
+		io_zcrx_scrub(ctx->ifq);
+}
+
+static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
+{
+	u32 entries;
+
+	entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
+	return min(entries, ifq->rq_entries);
 }
+
+static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq,
+						 unsigned mask)
+{
+	unsigned int idx = ifq->cached_rq_head++ & mask;
+
+	return &ifq->rqes[idx];
+}
+
+static void io_zcrx_ring_refill(struct page_pool *pp,
+				struct io_zcrx_ifq *ifq)
+{
+	unsigned int mask = ifq->rq_entries - 1;
+	unsigned int entries;
+	netmem_ref netmem;
+
+	spin_lock_bh(&ifq->rq_lock);
+
+	entries = io_zcrx_rqring_entries(ifq);
+	entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count);
+	if (unlikely(!entries)) {
+		spin_unlock_bh(&ifq->rq_lock);
+		return;
+	}
+
+	do {
+		struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask);
+		struct io_zcrx_area *area;
+		struct net_iov *niov;
+		unsigned niov_idx, area_idx;
+
+		area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT;
+		niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> PAGE_SHIFT;
+
+		if (unlikely(rqe->__pad || area_idx))
+			continue;
+		area = ifq->area;
+
+		if (unlikely(niov_idx >= area->nia.num_niovs))
+			continue;
+		niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs);
+
+		niov = &area->nia.niovs[niov_idx];
+		if (!io_zcrx_put_niov_uref(niov))
+			continue;
+
+		netmem = net_iov_to_netmem(niov);
+		if (page_pool_unref_netmem(netmem, 1) != 0)
+			continue;
+
+		if (unlikely(niov->pp != pp)) {
+			io_zcrx_return_niov(niov);
+			continue;
+		}
+
+		net_mp_netmem_place_in_cache(pp, netmem);
+	} while (--entries);
+
+	smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
+	spin_unlock_bh(&ifq->rq_lock);
+}
+
+static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
+{
+	struct io_zcrx_area *area = ifq->area;
+
+	spin_lock_bh(&area->freelist_lock);
+	while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) {
+		struct net_iov *niov = __io_zcrx_get_free_niov(area);
+		netmem_ref netmem = net_iov_to_netmem(niov);
+
+		net_mp_niov_set_page_pool(pp, niov);
+		net_mp_netmem_place_in_cache(pp, netmem);
+	}
+	spin_unlock_bh(&area->freelist_lock);
+}
+
+static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
+{
+	struct io_zcrx_ifq *ifq = pp->mp_priv;
+
+	/* pp should already be ensuring that */
+	if (unlikely(pp->alloc.count))
+		goto out_return;
+
+	io_zcrx_ring_refill(pp, ifq);
+	if (likely(pp->alloc.count))
+		goto out_return;
+
+	io_zcrx_refill_slow(pp, ifq);
+	if (!pp->alloc.count)
+		return 0;
+out_return:
+	return pp->alloc.cache[--pp->alloc.count];
+}
+
+static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
+{
+	struct net_iov *niov;
+
+	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
+		return false;
+
+	niov = netmem_to_net_iov(netmem);
+	net_mp_niov_clear_page_pool(niov);
+	io_zcrx_return_niov_freelist(niov);
+	return false;
+}
+
+static int io_pp_zc_init(struct page_pool *pp)
+{
+	struct io_zcrx_ifq *ifq = pp->mp_priv;
+
+	if (WARN_ON_ONCE(!ifq))
+		return -EINVAL;
+	if (pp->dma_map)
+		return -EOPNOTSUPP;
+	if (pp->p.order != 0)
+		return -EOPNOTSUPP;
+
+	percpu_ref_get(&ifq->ctx->refs);
+	return 0;
+}
+
+static void io_pp_zc_destroy(struct page_pool *pp)
+{
+	struct io_zcrx_ifq *ifq = pp->mp_priv;
+	struct io_zcrx_area *area = ifq->area;
+
+	if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
+		return;
+	percpu_ref_put(&ifq->ctx->refs);
+}
+
+static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp,
+			 struct netdev_rx_queue *rxq)
+{
+	struct nlattr *nest;
+	int type;
+
+	type = rxq ? NETDEV_A_QUEUE_IO_URING : NETDEV_A_PAGE_POOL_IO_URING;
+	nest = nla_nest_start(rsp, type);
+	if (!nest)
+		return -EMSGSIZE;
+	nla_nest_end(rsp, nest);
+
+	return 0;
+}
+
+static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq)
+{
+	struct pp_memory_provider_params *p = &rxq->mp_params;
+	struct io_zcrx_ifq *ifq = mp_priv;
+
+	io_zcrx_drop_netdev(ifq);
+	p->mp_ops = NULL;
+	p->mp_priv = NULL;
+}
+
+static const struct memory_provider_ops io_uring_pp_zc_ops = {
+	.alloc_netmems		= io_pp_zc_alloc_netmems,
+	.release_netmem		= io_pp_zc_release_netmem,
+	.init			= io_pp_zc_init,
+	.destroy		= io_pp_zc_destroy,
+	.nl_fill		= io_pp_nl_fill,
+	.uninstall		= io_pp_uninstall,
+};
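The refill consumer above (io_zcrx_rqring_entries(), io_zcrx_get_rqe(), and the head publication at the end of io_zcrx_ring_refill()) follows the standard io_uring ring discipline: acquire the producer's tail once per batch, walk a private cached head, and release the head back when the batch is done. A self-contained userspace model of just that indexing, with hypothetical names, might look like this:

/* Minimal sketch of the cached-head ring discipline; names are invented. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define RQ_ENTRIES 8			/* power of two, like rq_entries */

struct ring {
	_Atomic uint32_t head;		/* consumer publishes progress here */
	_Atomic uint32_t tail;		/* producer publishes new entries here */
	uint32_t cached_head;		/* consumer-private, like cached_rq_head */
	uint64_t slots[RQ_ENTRIES];
};

static uint32_t ring_entries(struct ring *r)
{
	/* unsigned subtraction handles index wraparound correctly */
	uint32_t n = atomic_load_explicit(&r->tail, memory_order_acquire) -
		     r->cached_head;

	return n < RQ_ENTRIES ? n : RQ_ENTRIES;	/* clamp, as with rq_entries */
}

static void consume_batch(struct ring *r)
{
	uint32_t n = ring_entries(r);

	while (n--)
		printf("got %llu\n", (unsigned long long)
		       r->slots[r->cached_head++ & (RQ_ENTRIES - 1)]);
	/* pairs with an acquire load of head on the producer side */
	atomic_store_explicit(&r->head, r->cached_head, memory_order_release);
}

int main(void)
{
	struct ring r = {0};

	r.slots[0] = 42;
	r.slots[1] = 7;
	atomic_store_explicit(&r.tail, 2, memory_order_release);
	consume_batch(&r);		/* prints 42, then 7 */
	return 0;
}

The free-running indices are why the ring size must be a power of two: masking with RQ_ENTRIES - 1 maps any index to a slot, and wraparound falls out of unsigned arithmetic.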

io_uring/zcrx.h

Lines changed: 3 additions & 0 deletions
@@ -9,6 +9,7 @@
 struct io_zcrx_area {
 	struct net_iov_area	nia;
 	struct io_zcrx_ifq	*ifq;
+	atomic_t		*user_refs;
 
 	u16			area_id;
 	struct page		**pages;
@@ -26,6 +27,8 @@ struct io_zcrx_ifq {
 	struct io_uring		*rq_ring;
 	struct io_uring_zcrx_rqe *rqes;
 	u32			rq_entries;
+	u32			cached_rq_head;
+	spinlock_t		rq_lock;
 
 	u32			if_rxq;
 	struct device		*dev;
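The user_refs array added here backs the per-net_iov "user" reference count from the commit message. A toy model of that discipline, with hypothetical names, matching the behaviour of io_zcrx_put_niov_uref() and io_zcrx_scrub() in zcrx.c above:

/* Toy single-buffer model of the "user ref" accounting; names invented. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static _Atomic int user_ref;	/* one per buffer, as in area->user_refs[] */

/* Kernel side: a fragment of this buffer is handed to userspace. */
static void give_to_user(void)
{
	atomic_fetch_add(&user_ref, 1);
}

/* Refill side: mirrors io_zcrx_put_niov_uref(); a return the user does
 * not actually hold is rejected instead of underflowing the counter. */
static bool put_user_ref(void)
{
	if (atomic_load(&user_ref) == 0)
		return false;
	atomic_fetch_sub(&user_ref, 1);
	return true;
}

int main(void)
{
	give_to_user();
	give_to_user();			/* two fragments of one buffer */
	printf("%d\n", put_user_ref());	/* 1: legitimate return */
	printf("%d\n", put_user_ref());	/* 1 */
	printf("%d\n", put_user_ref());	/* 0: rejected, nothing held */

	/* At shutdown, io_zcrx_scrub() instead does atomic_xchg(..., 0)
	 * and drops that many page pool references in one sweep. */
	return 0;
}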
