@@ -2,10 +2,17 @@
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/mm.h>
+#include <linux/nospec.h>
 #include <linux/io_uring.h>
 #include <linux/netdevice.h>
 #include <linux/rtnetlink.h>
 
+#include <net/page_pool/helpers.h>
+#include <net/page_pool/memory_provider.h>
+#include <net/netlink.h>
+
+#include <trace/events/page_pool.h>
+
 #include <uapi/linux/io_uring.h>
 
 #include "io_uring.h"
@@ -16,6 +23,33 @@
 
 #define IO_RQ_MAX_ENTRIES	32768
 
+__maybe_unused
+static const struct memory_provider_ops io_uring_pp_zc_ops;
+
+static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
+{
+	struct net_iov_area *owner = net_iov_owner(niov);
+
+	return container_of(owner, struct io_zcrx_area, nia);
+}
+
+static inline atomic_t *io_get_user_counter(struct net_iov *niov)
+{
+	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+
+	return &area->user_refs[net_iov_idx(niov)];
+}
+
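+/*
+ * Drop one of the references accounted to userspace for this buffer.
+ * Returns false if userspace holds no references, i.e. the refill entry
+ * does not correspond to a buffer it currently owns.
+ */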
+static bool io_zcrx_put_niov_uref(struct net_iov *niov)
+{
+	atomic_t *uref = io_get_user_counter(niov);
+
+	if (unlikely(!atomic_read(uref)))
+		return false;
+	atomic_dec(uref);
+	return true;
+}
+
 static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
				 struct io_uring_zcrx_ifq_reg *reg,
				 struct io_uring_region_desc *rd)
@@ -51,6 +85,7 @@ static void io_zcrx_free_area(struct io_zcrx_area *area)
 {
 	kvfree(area->freelist);
 	kvfree(area->nia.niovs);
+	kvfree(area->user_refs);
 	if (area->pages) {
 		unpin_user_pages(area->pages, area->nia.num_niovs);
 		kvfree(area->pages);
@@ -106,6 +141,19 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
 	for (i = 0; i < nr_pages; i++)
 		area->freelist[i] = i;
 
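+	/* one counter per niov, tracking buffer references handed out to userspace */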
+	area->user_refs = kvmalloc_array(nr_pages, sizeof(area->user_refs[0]),
+					 GFP_KERNEL | __GFP_ZERO);
+	if (!area->user_refs)
+		goto err;
+
+	for (i = 0; i < nr_pages; i++) {
+		struct net_iov *niov = &area->nia.niovs[i];
+
+		niov->owner = &area->nia;
+		area->freelist[i] = i;
+		atomic_set(&area->user_refs[i], 0);
+	}
+
 	area->free_count = nr_pages;
 	area->ifq = ifq;
 	/* we're only supporting one area per ifq for now */
@@ -131,6 +179,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
 	ifq->if_rxq = -1;
 	ifq->ctx = ctx;
 	spin_lock_init(&ifq->lock);
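+	/* rq_lock serialises draining of the refill ring in io_zcrx_ring_refill() */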
+	spin_lock_init(&ifq->rq_lock);
 	return ifq;
 }
 
@@ -256,7 +305,232 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
 	io_zcrx_ifq_free(ifq);
 }
 
+static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
+{
+	unsigned niov_idx;
+
+	lockdep_assert_held(&area->freelist_lock);
+
+	niov_idx = area->freelist[--area->free_count];
+	return &area->nia.niovs[niov_idx];
+}
+
+static void io_zcrx_return_niov_freelist(struct net_iov *niov)
+{
+	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+
+	spin_lock_bh(&area->freelist_lock);
+	area->freelist[area->free_count++] = net_iov_idx(niov);
+	spin_unlock_bh(&area->freelist_lock);
+}
+
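+/*
+ * Hand a buffer back to its owning page pool. allow_direct is false as this
+ * may be called outside of the pool's napi context.
+ */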
+static void io_zcrx_return_niov(struct net_iov *niov)
+{
+	netmem_ref netmem = net_iov_to_netmem(niov);
+
+	page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false);
+}
+
+static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
+{
+	struct io_zcrx_area *area = ifq->area;
+	int i;
+
+	if (!area)
+		return;
+
+	/* Reclaim back all buffers given to the user space. */
+	for (i = 0; i < area->nia.num_niovs; i++) {
+		struct net_iov *niov = &area->nia.niovs[i];
+		int nr;
+
+		if (!atomic_read(io_get_user_counter(niov)))
+			continue;
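+		/*
+		 * Take back all references userspace still holds and, if that
+		 * was the last page pool reference, recycle the buffer.
+		 */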
+		nr = atomic_xchg(io_get_user_counter(niov), 0);
+		if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
+			io_zcrx_return_niov(niov);
+	}
+}
+
 void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
 {
 	lockdep_assert_held(&ctx->uring_lock);
+
+	if (ctx->ifq)
+		io_zcrx_scrub(ctx->ifq);
+}
+
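+/*
+ * The refill ring tail is advanced by userspace as it returns buffers; the
+ * acquire load pairs with that update so the posted rqes are visible before
+ * the kernel reads them.
+ */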
+static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
+{
+	u32 entries;
+
+	entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
+	return min(entries, ifq->rq_entries);
 }
+
+static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq,
+						 unsigned mask)
+{
+	unsigned int idx = ifq->cached_rq_head++ & mask;
+
+	return &ifq->rqes[idx];
+}
+
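+/*
+ * Drain refill entries posted by userspace and place the returned buffers
+ * into the page pool's allocation cache. Each entry is validated: the area
+ * and niov indices must be in range and userspace must actually hold a
+ * reference on the buffer; invalid entries are silently skipped.
+ */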
+static void io_zcrx_ring_refill(struct page_pool *pp,
+				struct io_zcrx_ifq *ifq)
+{
+	unsigned int mask = ifq->rq_entries - 1;
+	unsigned int entries;
+	netmem_ref netmem;
+
+	spin_lock_bh(&ifq->rq_lock);
+
+	entries = io_zcrx_rqring_entries(ifq);
+	entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count);
+	if (unlikely(!entries)) {
+		spin_unlock_bh(&ifq->rq_lock);
+		return;
+	}
+
+	do {
+		struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask);
+		struct io_zcrx_area *area;
+		struct net_iov *niov;
+		unsigned niov_idx, area_idx;
+
+		area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT;
+		niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> PAGE_SHIFT;
+
+		if (unlikely(rqe->__pad || area_idx))
+			continue;
+		area = ifq->area;
+
+		if (unlikely(niov_idx >= area->nia.num_niovs))
+			continue;
+		niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs);
+
+		niov = &area->nia.niovs[niov_idx];
+		if (!io_zcrx_put_niov_uref(niov))
+			continue;
+
+		netmem = net_iov_to_netmem(niov);
+		if (page_pool_unref_netmem(netmem, 1) != 0)
+			continue;
+
+		if (unlikely(niov->pp != pp)) {
+			io_zcrx_return_niov(niov);
+			continue;
+		}
+
+		net_mp_netmem_place_in_cache(pp, netmem);
+	} while (--entries);
+
+	smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
+	spin_unlock_bh(&ifq->rq_lock);
+}
+
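+/*
+ * Fallback path: when the refill ring is empty, top up the page pool cache
+ * from the area's kernel-side freelist.
+ */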
+static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
+{
+	struct io_zcrx_area *area = ifq->area;
+
+	spin_lock_bh(&area->freelist_lock);
+	while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) {
+		struct net_iov *niov = __io_zcrx_get_free_niov(area);
+		netmem_ref netmem = net_iov_to_netmem(niov);
+
+		net_mp_niov_set_page_pool(pp, niov);
+		net_mp_netmem_place_in_cache(pp, netmem);
+	}
+	spin_unlock_bh(&area->freelist_lock);
+}
+
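+/*
+ * Memory provider allocation callback: refill the pool cache from the
+ * userspace refill ring first, falling back to the freelist if it's empty.
+ */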
+static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
+{
+	struct io_zcrx_ifq *ifq = pp->mp_priv;
+
+	/* pp should already be ensuring that */
+	if (unlikely(pp->alloc.count))
+		goto out_return;
+
+	io_zcrx_ring_refill(pp, ifq);
+	if (likely(pp->alloc.count))
+		goto out_return;
+
+	io_zcrx_refill_slow(pp, ifq);
+	if (!pp->alloc.count)
+		return 0;
+out_return:
+	return pp->alloc.cache[--pp->alloc.count];
+}
+
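+/*
+ * Called when the pool releases a netmem: detach it from the pool and put it
+ * back on the freelist. Returning false tells the pool that the provider
+ * keeps ownership of the underlying memory.
+ */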
+static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
+{
+	struct net_iov *niov;
+
+	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
+		return false;
+
+	niov = netmem_to_net_iov(netmem);
+	net_mp_niov_clear_page_pool(niov);
+	io_zcrx_return_niov_freelist(niov);
+	return false;
+}
+
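+/*
+ * The provider hands out pre-pinned user pages, so pools that want to
+ * DMA-map on their own or use higher-order pages are rejected. The io_uring
+ * context is pinned for as long as the pool exists.
+ */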
+static int io_pp_zc_init(struct page_pool *pp)
+{
+	struct io_zcrx_ifq *ifq = pp->mp_priv;
+
+	if (WARN_ON_ONCE(!ifq))
+		return -EINVAL;
+	if (pp->dma_map)
+		return -EOPNOTSUPP;
+	if (pp->p.order != 0)
+		return -EOPNOTSUPP;
+
+	percpu_ref_get(&ifq->ctx->refs);
+	return 0;
+}
+
+static void io_pp_zc_destroy(struct page_pool *pp)
+{
+	struct io_zcrx_ifq *ifq = pp->mp_priv;
+	struct io_zcrx_area *area = ifq->area;
+
+	if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
+		return;
+	percpu_ref_put(&ifq->ctx->refs);
+}
+
+static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp,
+			 struct netdev_rx_queue *rxq)
+{
+	struct nlattr *nest;
+	int type;
+
+	type = rxq ? NETDEV_A_QUEUE_IO_URING : NETDEV_A_PAGE_POOL_IO_URING;
+	nest = nla_nest_start(rsp, type);
+	if (!nest)
+		return -EMSGSIZE;
+	nla_nest_end(rsp, nest);
+
+	return 0;
+}
+
+static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq)
+{
+	struct pp_memory_provider_params *p = &rxq->mp_params;
+	struct io_zcrx_ifq *ifq = mp_priv;
+
+	io_zcrx_drop_netdev(ifq);
+	p->mp_ops = NULL;
+	p->mp_priv = NULL;
+}
+
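+/* io_uring zero-copy rx memory provider callbacks for the page pool core */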
+static const struct memory_provider_ops io_uring_pp_zc_ops = {
+	.alloc_netmems = io_pp_zc_alloc_netmems,
+	.release_netmem = io_pp_zc_release_netmem,
+	.init = io_pp_zc_init,
+	.destroy = io_pp_zc_destroy,
+	.nl_fill = io_pp_nl_fill,
+	.uninstall = io_pp_uninstall,
+};