From 6e4793492ebff266378c134fd1f2ca935619dedf Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Fri, 7 Nov 2025 12:28:45 +0200 Subject: [PATCH 1/2] page_pool: add benchmarking for napi-based recycling The code brings back the tasklet based code in order to be able to run in softirq context. One additional test is added which benchmarks the impact of page_pool_napi_local(). Signed-off-by: Dragos Tatulea --- .../bench/page_pool/bench_page_pool_simple.c | 92 ++++++++++++++++++- 1 file changed, 90 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c b/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c index cb6468adbda4d..84683c5478146 100644 --- a/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c +++ b/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include "time_bench.h" @@ -16,6 +17,8 @@ static int verbose = 1; #define MY_POOL_SIZE 1024 +DEFINE_MUTEX(wait_for_tasklet); + /* Makes tests selectable. Useful for perf-record to analyze a single test. * Hint: Bash shells support writing binary number like: $((2#101010) * @@ -31,6 +34,10 @@ enum benchmark_bit { bit_run_bench_no_softirq01, bit_run_bench_no_softirq02, bit_run_bench_no_softirq03, + bit_run_bench_tasklet01, + bit_run_bench_tasklet02, + bit_run_bench_tasklet03, + bit_run_bench_tasklet04, }; #define bit(b) (1 << (b)) @@ -120,7 +127,12 @@ static void pp_fill_ptr_ring(struct page_pool *pp, int elems) kfree(array); } -enum test_type { type_fast_path, type_ptr_ring, type_page_allocator }; +enum test_type { + type_fast_path, + type_napi_aware, + type_ptr_ring, + type_page_allocator, +}; /* Depends on compile optimizing this function */ static int time_bench_page_pool(struct time_bench_record *rec, void *data, @@ -132,6 +144,7 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data, struct page_pool *pp; struct page *page; + struct napi_struct napi = {0}; struct page_pool_params pp_params = { .order = 0, @@ -141,6 +154,7 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data, .dev = NULL, /* Only use for DMA mapping */ .dma_dir = DMA_BIDIRECTIONAL, }; + struct page_pool_stats stats = {0}; pp = page_pool_create(&pp_params); if (IS_ERR(pp)) { @@ -155,6 +169,11 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data, else pr_warn("%s(): Cannot use page_pool fast-path\n", func); + if (type == type_napi_aware) { + napi.list_owner = smp_processor_id(); + page_pool_enable_direct_recycling(pp, &napi); + } + time_bench_start(rec); /** Loop to measure **/ for (i = 0; i < rec->loops; i++) { @@ -173,7 +192,13 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data, page_pool_recycle_direct(pp, page); } else if (type == type_ptr_ring) { - /* Normal return path */ + /* Normal return path, either direct or via ptr_ring */ + page_pool_put_page(pp, page, -1, false); + + } else if (type == type_napi_aware) { + /* NAPI-aware recycling: uses fast-path recycling if + * possible. 
+			 */
 			page_pool_put_page(pp, page, -1, false);
 
 		} else if (type == type_page_allocator) {
@@ -188,6 +213,14 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data,
 		}
 	}
 	time_bench_stop(rec, loops_cnt);
+
+	if (type == type_napi_aware) {
+		page_pool_get_stats(pp, &stats);
+		if (stats.recycle_stats.cached < rec->loops)
+			pr_warn("%s(): NAPI-aware recycling wasn't used\n",
+				func);
+	}
+
 out:
 	page_pool_destroy(pp);
 	return loops_cnt;
@@ -211,6 +244,54 @@ static int time_bench_page_pool03_slow(struct time_bench_record *rec,
 	return time_bench_page_pool(rec, data, type_page_allocator, __func__);
 }
 
+static int time_bench_page_pool04_napi_aware(struct time_bench_record *rec,
+					     void *data)
+{
+	return time_bench_page_pool(rec, data, type_napi_aware, __func__);
+}
+
+/* Testing page_pool requires running under softirq.
+ *
+ * Running under a tasklet satisfies this, as tasklets are built on top of
+ * softirq.
+ */
+static void pp_tasklet_handler(struct tasklet_struct *t)
+{
+	uint32_t nr_loops = loops;
+
+	if (in_serving_softirq())
+		pr_warn("%s(): in_serving_softirq fast-path\n",
+			__func__); // True
+	else
+		pr_warn("%s(): Cannot use page_pool fast-path\n", __func__);
+
+	if (enabled(bit_run_bench_tasklet01))
+		time_bench_loop(nr_loops, 0, "tasklet_page_pool01_fast_path",
+				NULL, time_bench_page_pool01_fast_path);
+
+	if (enabled(bit_run_bench_tasklet02))
+		time_bench_loop(nr_loops, 0, "tasklet_page_pool02_ptr_ring",
+				NULL, time_bench_page_pool02_ptr_ring);
+
+	if (enabled(bit_run_bench_tasklet03))
+		time_bench_loop(nr_loops, 0, "tasklet_page_pool03_slow", NULL,
+				time_bench_page_pool03_slow);
+
+	if (enabled(bit_run_bench_tasklet04))
+		time_bench_loop(nr_loops, 0, "tasklet_page_pool04_napi_aware",
+				NULL, time_bench_page_pool04_napi_aware);
+
+	mutex_unlock(&wait_for_tasklet); /* Module __init waiting on unlock */
+}
+DECLARE_TASKLET_DISABLED(pp_tasklet, pp_tasklet_handler);
+
+static void run_tasklet_tests(void)
+{
+	tasklet_enable(&pp_tasklet);
+	/* "Async" schedule tasklet, which runs on the CPU that schedules it */
+	tasklet_schedule(&pp_tasklet);
+}
+
 static int run_benchmark_tests(void)
 {
 	uint32_t nr_loops = loops;
@@ -251,12 +332,19 @@ static int __init bench_page_pool_simple_module_init(void)
 
 	run_benchmark_tests();
 
+	mutex_lock(&wait_for_tasklet);
+	run_tasklet_tests();
+	/* Sleep on mutex, waiting for tasklet to release */
+	mutex_lock(&wait_for_tasklet);
+
 	return 0;
 }
 module_init(bench_page_pool_simple_module_init);
 
 static void __exit bench_page_pool_simple_module_exit(void)
 {
+	tasklet_kill(&pp_tasklet);
+
 	if (verbose)
 		pr_info("Unloaded\n");
 }

From ca93a678cef152e55d85a4fa60d9aec3f7360293 Mon Sep 17 00:00:00 2001
From: Dragos Tatulea
Date: Fri, 7 Nov 2025 12:28:46 +0200
Subject: [PATCH 2/2] xdp: Delegate fast path return decision to page_pool

XDP uses the BPF_RI_F_RF_NO_DIRECT flag to mark contexts where it is not
allowed to do direct recycling, even though the direct flag was set by
the caller. This is confusing and can lead to races which are hard to
detect [1]. Furthermore, the page_pool already contains an internal
mechanism which checks if it is safe to switch the direct flag from off
to on.

This patch drops the use of the BPF_RI_F_RF_NO_DIRECT flag and always
calls the page_pool release with the direct flag set to false. The
page_pool will decide if it is safe to do direct recycling. This is not
free, but it is worth it to make the XDP code safer. The next paragraphs
discuss the performance impact.

Performance-wise, there are 3 cases to consider.
Looking at __xdp_return() for the MEM_TYPE_PAGE_POOL case:

1) napi_direct == false:
- Before: 1 comparison in __xdp_return() + a page_pool_napi_local() call
  from page_pool_put_unrefed_netmem().
- After: only the page_pool_napi_local() call.

2) napi_direct == true && BPF_RI_F_RF_NO_DIRECT:
- Before: 2 comparisons in __xdp_return() + a page_pool_napi_local() call
  from page_pool_put_unrefed_netmem().
- After: only the page_pool_napi_local() call.

3) napi_direct == true && !BPF_RI_F_RF_NO_DIRECT:
- Before: 2 comparisons in __xdp_return().
- After: one page_pool_napi_local() call.

Cases 1 and 2 are the slower paths and can only gain from the change, but
they are slow anyway, so the gain is small. Case 3 is the fast path and is
the one that needs a closer look: the 2 comparisons in __xdp_return() are
traded for the more expensive page_pool_napi_local() call.

Comparing the fast-path mode with the newly added NAPI-aware mode of the
page_pool benchmark, measured on [2], shows how expensive
page_pool_napi_local() is:

bench_page_pool: time_bench_page_pool01_fast_path(): in_serving_softirq fast-path
bench_page_pool: Type:tasklet_page_pool01_fast_path Per elem: 15 cycles(tsc) 7.537 ns (step:0)
bench_page_pool: time_bench_page_pool04_napi_aware(): in_serving_softirq fast-path
bench_page_pool: Type:tasklet_page_pool04_napi_aware Per elem: 20 cycles(tsc) 10.490 ns (step:0)

... and the slow path for reference:

bench_page_pool: time_bench_page_pool02_ptr_ring(): in_serving_softirq fast-path
bench_page_pool: Type:tasklet_page_pool02_ptr_ring Per elem: 30 cycles(tsc) 15.395 ns (step:0)

So the impact on the fast path is small (20 - 15 = 5 cycles, or roughly
3 ns, per element) but not negligible. Note that the benchmark does not
model the napi_direct comparisons that are dropped from __xdp_return(), so
the real impact will be smaller than the difference measured here.
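For reference, the internal check that lets page_pool upgrade a non-direct
release to direct recycling (page_pool_napi_local()) boils down to a
softirq-context check plus verifying that the current CPU owns the pool's
NAPI instance. The snippet below is a simplified paraphrase for
illustration only: the helper name is made up here, and the real
implementation in net/core/page_pool.c has more details.

  /* Illustrative sketch, not the upstream code: direct (lockless)
   * recycling is only considered safe in softirq context, on the CPU
   * that owns the page_pool's NAPI instance.
   */
  static bool pp_direct_recycle_ok(const struct napi_struct *napi)
  {
  	if (!in_softirq())
  		return false;

  	return napi && READ_ONCE(napi->list_owner) == smp_processor_id();
  }

This is a couple of loads and comparisons, which is consistent with the
~5 cycle (~3 ns) per-element delta between the fast-path and NAPI-aware
results above. It is also the path exercised by the new type_napi_aware
benchmark mode after it calls page_pool_enable_direct_recycling().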
[1] Commit 2b986b9e917b ("bpf, cpumap: Disable page_pool direct xdp_return need larger scope") [2] Intel Xeon Platinum 8580 Signed-off-by: Dragos Tatulea --- drivers/net/veth.c | 2 -- include/linux/filter.h | 22 ---------------------- include/net/xdp.h | 2 +- kernel/bpf/cpumap.c | 2 -- net/bpf/test_run.c | 2 -- net/core/filter.c | 2 +- net/core/xdp.c | 24 ++++++++++++------------ 7 files changed, 14 insertions(+), 42 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index a3046142cb8e2..6d5c1e0b05a7c 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -975,7 +975,6 @@ static int veth_poll(struct napi_struct *napi, int budget) bq.count = 0; - xdp_set_return_frame_no_direct(); done = veth_xdp_rcv(rq, budget, &bq, &stats); if (stats.xdp_redirect > 0) @@ -994,7 +993,6 @@ static int veth_poll(struct napi_struct *napi, int budget) if (stats.xdp_tx > 0) veth_xdp_flush(rq, &bq); - xdp_clear_return_frame_no_direct(); return done; } diff --git a/include/linux/filter.h b/include/linux/filter.h index f5c859b8131a3..877e40d81a4c0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -764,7 +764,6 @@ struct bpf_nh_params { }; /* flags for bpf_redirect_info kern_flags */ -#define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ #define BPF_RI_F_RI_INIT BIT(1) #define BPF_RI_F_CPU_MAP_INIT BIT(2) #define BPF_RI_F_DEV_MAP_INIT BIT(3) @@ -1163,27 +1162,6 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt); -static inline bool xdp_return_frame_no_direct(void) -{ - struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); - - return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT; -} - -static inline void xdp_set_return_frame_no_direct(void) -{ - struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); - - ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT; -} - -static inline void xdp_clear_return_frame_no_direct(void) -{ - struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); - - ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT; -} - static inline int xdp_ok_fwd_dev(const struct net_device *fwd, unsigned int pktlen) { diff --git a/include/net/xdp.h b/include/net/xdp.h index aa742f413c358..2a44d84a7611d 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -446,7 +446,7 @@ struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp) } void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type, - bool napi_direct, struct xdp_buff *xdp); + struct xdp_buff *xdp); void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 703e5df1f4ef9..3ece03dc36bdc 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -253,7 +253,6 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); - xdp_set_return_frame_no_direct(); ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats); if (unlikely(ret->skb_n)) @@ -263,7 +262,6 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, if (stats->redirect) xdp_do_flush(); - xdp_clear_return_frame_no_direct(); bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 655efac6f1334..a349b976f819d 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -289,7 +289,6 @@ static int 
xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog, local_bh_disable(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); ri = bpf_net_ctx_get_ri(); - xdp_set_return_frame_no_direct(); for (i = 0; i < batch_sz; i++) { page = page_pool_dev_alloc_pages(xdp->pp); @@ -352,7 +351,6 @@ static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog, err = ret; } - xdp_clear_return_frame_no_direct(); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); return err; diff --git a/net/core/filter.c b/net/core/filter.c index 1efec0d70d783..57c587e910ec3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4195,7 +4195,7 @@ static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, } if (release) { - __xdp_return(netmem, mem_type, false, zc_frag); + __xdp_return(netmem, mem_type, zc_frag); } else { if (!tail) skb_frag_off_add(frag, shrink); diff --git a/net/core/xdp.c b/net/core/xdp.c index 9100e160113a9..cf8eab699d9a3 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -431,18 +431,18 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_attach_page_pool); * of xdp_frames/pages in those cases. */ void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type, - bool napi_direct, struct xdp_buff *xdp) + struct xdp_buff *xdp) { switch (mem_type) { case MEM_TYPE_PAGE_POOL: netmem = netmem_compound_head(netmem); - if (napi_direct && xdp_return_frame_no_direct()) - napi_direct = false; + /* No need to check netmem_is_pp() as mem->type knows this a * page_pool page + * + * page_pool can detect direct recycle. */ - page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, - napi_direct); + page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false); break; case MEM_TYPE_PAGE_SHARED: page_frag_free(__netmem_address(netmem)); @@ -471,10 +471,10 @@ void xdp_return_frame(struct xdp_frame *xdpf) sinfo = xdp_get_shared_info_from_frame(xdpf); for (u32 i = 0; i < sinfo->nr_frags; i++) __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type, - false, NULL); + NULL); out: - __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, false, NULL); + __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame); @@ -488,10 +488,10 @@ void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) sinfo = xdp_get_shared_info_from_frame(xdpf); for (u32 i = 0; i < sinfo->nr_frags; i++) __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type, - true, NULL); + NULL); out: - __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, true, NULL); + __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); @@ -542,7 +542,7 @@ EXPORT_SYMBOL_GPL(xdp_return_frame_bulk); */ void xdp_return_frag(netmem_ref netmem, const struct xdp_buff *xdp) { - __xdp_return(netmem, xdp->rxq->mem.type, true, NULL); + __xdp_return(netmem, xdp->rxq->mem.type, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frag); @@ -556,10 +556,10 @@ void xdp_return_buff(struct xdp_buff *xdp) sinfo = xdp_get_shared_info_from_buff(xdp); for (u32 i = 0; i < sinfo->nr_frags; i++) __xdp_return(skb_frag_netmem(&sinfo->frags[i]), - xdp->rxq->mem.type, true, xdp); + xdp->rxq->mem.type, xdp); out: - __xdp_return(virt_to_netmem(xdp->data), xdp->rxq->mem.type, true, xdp); + __xdp_return(virt_to_netmem(xdp->data), xdp->rxq->mem.type, xdp); } EXPORT_SYMBOL_GPL(xdp_return_buff);