From 6e4793492ebff266378c134fd1f2ca935619dedf Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Fri, 7 Nov 2025 12:28:45 +0200 Subject: [PATCH 1/2] page_pool: add benchmarking for napi-based recycling The code brings back the tasklet based code in order to be able to run in softirq context. One additional test is added which benchmarks the impact of page_pool_napi_local(). Signed-off-by: Dragos Tatulea --- .../bench/page_pool/bench_page_pool_simple.c | 92 ++++++++++++++++++- 1 file changed, 90 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c b/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c index cb6468adbda4d..84683c5478146 100644 --- a/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c +++ b/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include "time_bench.h" @@ -16,6 +17,8 @@ static int verbose = 1; #define MY_POOL_SIZE 1024 +DEFINE_MUTEX(wait_for_tasklet); + /* Makes tests selectable. Useful for perf-record to analyze a single test. * Hint: Bash shells support writing binary number like: $((2#101010) * @@ -31,6 +34,10 @@ enum benchmark_bit { bit_run_bench_no_softirq01, bit_run_bench_no_softirq02, bit_run_bench_no_softirq03, + bit_run_bench_tasklet01, + bit_run_bench_tasklet02, + bit_run_bench_tasklet03, + bit_run_bench_tasklet04, }; #define bit(b) (1 << (b)) @@ -120,7 +127,12 @@ static void pp_fill_ptr_ring(struct page_pool *pp, int elems) kfree(array); } -enum test_type { type_fast_path, type_ptr_ring, type_page_allocator }; +enum test_type { + type_fast_path, + type_napi_aware, + type_ptr_ring, + type_page_allocator, +}; /* Depends on compile optimizing this function */ static int time_bench_page_pool(struct time_bench_record *rec, void *data, @@ -132,6 +144,7 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data, struct page_pool *pp; struct page *page; + struct napi_struct napi = {0}; struct page_pool_params pp_params = { .order = 0, @@ -141,6 +154,7 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data, .dev = NULL, /* Only use for DMA mapping */ .dma_dir = DMA_BIDIRECTIONAL, }; + struct page_pool_stats stats = {0}; pp = page_pool_create(&pp_params); if (IS_ERR(pp)) { @@ -155,6 +169,11 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data, else pr_warn("%s(): Cannot use page_pool fast-path\n", func); + if (type == type_napi_aware) { + napi.list_owner = smp_processor_id(); + page_pool_enable_direct_recycling(pp, &napi); + } + time_bench_start(rec); /** Loop to measure **/ for (i = 0; i < rec->loops; i++) { @@ -173,7 +192,13 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data, page_pool_recycle_direct(pp, page); } else if (type == type_ptr_ring) { - /* Normal return path */ + /* Normal return path, either direct or via ptr_ring */ + page_pool_put_page(pp, page, -1, false); + + } else if (type == type_napi_aware) { + /* NAPI-aware recycling: uses fast-path recycling if + * possible. 
+			 */
 			page_pool_put_page(pp, page, -1, false);
 
 		} else if (type == type_page_allocator) {
@@ -188,6 +213,14 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data,
 		}
 	}
 	time_bench_stop(rec, loops_cnt);
+
+	if (type == type_napi_aware) {
+		page_pool_get_stats(pp, &stats);
+		if (stats.recycle_stats.cached < rec->loops)
+			pr_warn("%s(): NAPI-aware recycling wasn't used\n",
+				func);
+	}
+
 out:
 	page_pool_destroy(pp);
 	return loops_cnt;
@@ -211,6 +244,54 @@ static int time_bench_page_pool03_slow(struct time_bench_record *rec,
 	return time_bench_page_pool(rec, data, type_page_allocator, __func__);
 }
 
+static int time_bench_page_pool04_napi_aware(struct time_bench_record *rec,
+					     void *data)
+{
+	return time_bench_page_pool(rec, data, type_napi_aware, __func__);
+}
+
+/* Testing page_pool requires running under softirq.
+ *
+ * Running under a tasklet satisfies this, as tasklets are built on top of
+ * softirq.
+ */
+static void pp_tasklet_handler(struct tasklet_struct *t)
+{
+	uint32_t nr_loops = loops;
+
+	if (in_serving_softirq())
+		pr_warn("%s(): in_serving_softirq fast-path\n",
+			__func__); // True
+	else
+		pr_warn("%s(): Cannot use page_pool fast-path\n", __func__);
+
+	if (enabled(bit_run_bench_tasklet01))
+		time_bench_loop(nr_loops, 0, "tasklet_page_pool01_fast_path",
+				NULL, time_bench_page_pool01_fast_path);
+
+	if (enabled(bit_run_bench_tasklet02))
+		time_bench_loop(nr_loops, 0, "tasklet_page_pool02_ptr_ring",
+				NULL, time_bench_page_pool02_ptr_ring);
+
+	if (enabled(bit_run_bench_tasklet03))
+		time_bench_loop(nr_loops, 0, "tasklet_page_pool03_slow", NULL,
+				time_bench_page_pool03_slow);
+
+	if (enabled(bit_run_bench_tasklet04))
+		time_bench_loop(nr_loops, 0, "tasklet_page_pool04_napi_aware",
+				NULL, time_bench_page_pool04_napi_aware);
+
+	mutex_unlock(&wait_for_tasklet); /* Module __init waiting on unlock */
+}
+DECLARE_TASKLET_DISABLED(pp_tasklet, pp_tasklet_handler);
+
+static void run_tasklet_tests(void)
+{
+	tasklet_enable(&pp_tasklet);
+	/* "Async" schedule tasklet, which runs on the CPU that schedules it */
+	tasklet_schedule(&pp_tasklet);
+}
+
 static int run_benchmark_tests(void)
 {
 	uint32_t nr_loops = loops;
@@ -251,12 +332,19 @@ static int __init bench_page_pool_simple_module_init(void)
 
 	run_benchmark_tests();
 
+	mutex_lock(&wait_for_tasklet);
+	run_tasklet_tests();
+	/* Sleep on mutex, waiting for tasklet to release */
+	mutex_lock(&wait_for_tasklet);
+
 	return 0;
 }
 module_init(bench_page_pool_simple_module_init);
 
 static void __exit bench_page_pool_simple_module_exit(void)
 {
+	tasklet_kill(&pp_tasklet);
+
 	if (verbose)
 		pr_info("Unloaded\n");
 }

From ca93a678cef152e55d85a4fa60d9aec3f7360293 Mon Sep 17 00:00:00 2001
From: Dragos Tatulea
Date: Fri, 7 Nov 2025 12:28:46 +0200
Subject: [PATCH 2/2] xdp: Delegate fast path return decision to page_pool

XDP uses the BPF_RI_F_RF_NO_DIRECT flag to mark contexts where it is not
allowed to do direct recycling, even though the direct flag was set by
the caller. This is confusing and can lead to races which are hard to
detect [1]. Furthermore, the page_pool already contains an internal
mechanism which checks if it is safe to switch the direct flag from off
to on.

This patch drops the use of the BPF_RI_F_RF_NO_DIRECT flag and always
calls the page_pool release with the direct flag set to false. The
page_pool will decide if it is safe to do direct recycling. This is not
free, but it is worth it to make the XDP code safer. The next paragraphs
discuss the performance impact.

Performance-wise, there are 3 cases to consider.
Looking at __xdp_return() for the MEM_TYPE_PAGE_POOL case:

1) napi_direct == false:
- Before: 1 comparison in __xdp_return() + a page_pool_napi_local() call
  from page_pool_put_unrefed_netmem().
- After: only the page_pool_napi_local() call.

2) napi_direct == true && BPF_RI_F_RF_NO_DIRECT:
- Before: 2 comparisons in __xdp_return() + a page_pool_napi_local() call
  from page_pool_put_unrefed_netmem().
- After: only the page_pool_napi_local() call.

3) napi_direct == true && !BPF_RI_F_RF_NO_DIRECT:
- Before: 2 comparisons in __xdp_return().
- After: one page_pool_napi_local() call.

Cases 1 and 2 are the slower paths and can only gain from the change, but
they are slow anyway, so the gain is small. Case 3 is the fast path and is
the one that needs a closer look: the 2 comparisons in __xdp_return() are
traded for the more expensive page_pool_napi_local() call.

Comparing the fast-path mode with the newly added NAPI-aware mode of the
page_pool benchmark, measured on [2], shows how expensive
page_pool_napi_local() is:

bench_page_pool: time_bench_page_pool01_fast_path(): in_serving_softirq fast-path
bench_page_pool: Type:tasklet_page_pool01_fast_path Per elem: 15 cycles(tsc) 7.537 ns (step:0)
bench_page_pool: time_bench_page_pool04_napi_aware(): in_serving_softirq fast-path
bench_page_pool: Type:tasklet_page_pool04_napi_aware Per elem: 20 cycles(tsc) 10.490 ns (step:0)

... and the slow path for reference:

bench_page_pool: time_bench_page_pool02_ptr_ring(): in_serving_softirq fast-path
bench_page_pool: Type:tasklet_page_pool02_ptr_ring Per elem: 30 cycles(tsc) 15.395 ns (step:0)

So the impact on the fast path is small (20 - 15 = 5 cycles, or roughly
3 ns, per element) but not negligible. Note that the benchmark does not
model the napi_direct comparisons that are dropped from __xdp_return(), so
the real impact will be smaller than the difference measured here.
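For reference, the internal check that lets page_pool upgrade a non-direct
release to direct recycling (page_pool_napi_local()) boils down to a
softirq-context check plus verifying that the current CPU owns the pool's
NAPI instance. The snippet below is a simplified paraphrase for
illustration only: the helper name is made up here, and the real
implementation in net/core/page_pool.c has more details.

  /* Illustrative sketch, not the upstream code: direct (lockless)
   * recycling is only considered safe in softirq context, on the CPU
   * that owns the page_pool's NAPI instance.
   */
  static bool pp_direct_recycle_ok(const struct napi_struct *napi)
  {
  	if (!in_softirq())
  		return false;

  	return napi && READ_ONCE(napi->list_owner) == smp_processor_id();
  }

This is a couple of loads and comparisons, which is consistent with the
~5 cycle (~3 ns) per-element delta between the fast-path and NAPI-aware
results above. It is also the path exercised by the new type_napi_aware
benchmark mode after it calls page_pool_enable_direct_recycling().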
[1] Commit 2b986b9e917b ("bpf, cpumap: Disable page_pool direct xdp_return need larger scope") [2] Intel Xeon Platinum 8580 Signed-off-by: Dragos Tatulea --- drivers/net/veth.c | 2 -- include/linux/filter.h | 22 ---------------------- include/net/xdp.h | 2 +- kernel/bpf/cpumap.c | 2 -- net/bpf/test_run.c | 2 -- net/core/filter.c | 2 +- net/core/xdp.c | 24 ++++++++++++------------ 7 files changed, 14 insertions(+), 42 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index a3046142cb8e2..6d5c1e0b05a7c 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -975,7 +975,6 @@ static int veth_poll(struct napi_struct *napi, int budget) bq.count = 0; - xdp_set_return_frame_no_direct(); done = veth_xdp_rcv(rq, budget, &bq, &stats); if (stats.xdp_redirect > 0) @@ -994,7 +993,6 @@ static int veth_poll(struct napi_struct *napi, int budget) if (stats.xdp_tx > 0) veth_xdp_flush(rq, &bq); - xdp_clear_return_frame_no_direct(); return done; } diff --git a/include/linux/filter.h b/include/linux/filter.h index f5c859b8131a3..877e40d81a4c0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -764,7 +764,6 @@ struct bpf_nh_params { }; /* flags for bpf_redirect_info kern_flags */ -#define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ #define BPF_RI_F_RI_INIT BIT(1) #define BPF_RI_F_CPU_MAP_INIT BIT(2) #define BPF_RI_F_DEV_MAP_INIT BIT(3) @@ -1163,27 +1162,6 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt); -static inline bool xdp_return_frame_no_direct(void) -{ - struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); - - return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT; -} - -static inline void xdp_set_return_frame_no_direct(void) -{ - struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); - - ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT; -} - -static inline void xdp_clear_return_frame_no_direct(void) -{ - struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); - - ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT; -} - static inline int xdp_ok_fwd_dev(const struct net_device *fwd, unsigned int pktlen) { diff --git a/include/net/xdp.h b/include/net/xdp.h index aa742f413c358..2a44d84a7611d 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -446,7 +446,7 @@ struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp) } void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type, - bool napi_direct, struct xdp_buff *xdp); + struct xdp_buff *xdp); void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 703e5df1f4ef9..3ece03dc36bdc 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -253,7 +253,6 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); - xdp_set_return_frame_no_direct(); ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats); if (unlikely(ret->skb_n)) @@ -263,7 +262,6 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, if (stats->redirect) xdp_do_flush(); - xdp_clear_return_frame_no_direct(); bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 655efac6f1334..a349b976f819d 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -289,7 +289,6 @@ static int 
xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog, local_bh_disable(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); ri = bpf_net_ctx_get_ri(); - xdp_set_return_frame_no_direct(); for (i = 0; i < batch_sz; i++) { page = page_pool_dev_alloc_pages(xdp->pp); @@ -352,7 +351,6 @@ static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog, err = ret; } - xdp_clear_return_frame_no_direct(); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); return err; diff --git a/net/core/filter.c b/net/core/filter.c index 1efec0d70d783..57c587e910ec3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4195,7 +4195,7 @@ static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, } if (release) { - __xdp_return(netmem, mem_type, false, zc_frag); + __xdp_return(netmem, mem_type, zc_frag); } else { if (!tail) skb_frag_off_add(frag, shrink); diff --git a/net/core/xdp.c b/net/core/xdp.c index 9100e160113a9..cf8eab699d9a3 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -431,18 +431,18 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_attach_page_pool); * of xdp_frames/pages in those cases. */ void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type, - bool napi_direct, struct xdp_buff *xdp) + struct xdp_buff *xdp) { switch (mem_type) { case MEM_TYPE_PAGE_POOL: netmem = netmem_compound_head(netmem); - if (napi_direct && xdp_return_frame_no_direct()) - napi_direct = false; + /* No need to check netmem_is_pp() as mem->type knows this a * page_pool page + * + * page_pool can detect direct recycle. */ - page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, - napi_direct); + page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false); break; case MEM_TYPE_PAGE_SHARED: page_frag_free(__netmem_address(netmem)); @@ -471,10 +471,10 @@ void xdp_return_frame(struct xdp_frame *xdpf) sinfo = xdp_get_shared_info_from_frame(xdpf); for (u32 i = 0; i < sinfo->nr_frags; i++) __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type, - false, NULL); + NULL); out: - __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, false, NULL); + __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame); @@ -488,10 +488,10 @@ void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) sinfo = xdp_get_shared_info_from_frame(xdpf); for (u32 i = 0; i < sinfo->nr_frags; i++) __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type, - true, NULL); + NULL); out: - __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, true, NULL); + __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); @@ -542,7 +542,7 @@ EXPORT_SYMBOL_GPL(xdp_return_frame_bulk); */ void xdp_return_frag(netmem_ref netmem, const struct xdp_buff *xdp) { - __xdp_return(netmem, xdp->rxq->mem.type, true, NULL); + __xdp_return(netmem, xdp->rxq->mem.type, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frag); @@ -556,10 +556,10 @@ void xdp_return_buff(struct xdp_buff *xdp) sinfo = xdp_get_shared_info_from_buff(xdp); for (u32 i = 0; i < sinfo->nr_frags; i++) __xdp_return(skb_frag_netmem(&sinfo->frags[i]), - xdp->rxq->mem.type, true, xdp); + xdp->rxq->mem.type, xdp); out: - __xdp_return(virt_to_netmem(xdp->data), xdp->rxq->mem.type, true, xdp); + __xdp_return(virt_to_netmem(xdp->data), xdp->rxq->mem.type, xdp); } EXPORT_SYMBOL_GPL(xdp_return_buff);