
Commit ca93a67

dtatulea authored and Kernel Patches Daemon committed
xdp: Delegate fast path return decision to page_pool
XDP uses the BPF_RI_F_RF_NO_DIRECT flag to mark contexts where it is not
allowed to do direct recycling, even though the direct flag was set by the
caller. This is confusing and can lead to races which are hard to detect [1].
Furthermore, the page_pool already contains an internal mechanism which checks
if it is safe to switch the direct flag from off to on.

This patch drops the use of the BPF_RI_F_RF_NO_DIRECT flag and always calls
the page_pool release with the direct flag set to false. The page_pool will
decide if it is safe to do direct recycling. This is not free, but it is worth
it to make the XDP code safer. The next paragraphs discuss the performance
impact.

Performance-wise, there are 3 cases to consider, looking from __xdp_return()
for the MEM_TYPE_PAGE_POOL case:

1) napi_direct == false:
   - Before: 1 comparison in __xdp_return() + call of page_pool_napi_local()
     from page_pool_put_unrefed_netmem().
   - After: Only one call to page_pool_napi_local().

2) napi_direct == true && BPF_RI_F_RF_NO_DIRECT:
   - Before: 2 comparisons in __xdp_return() + call of page_pool_napi_local()
     from page_pool_put_unrefed_netmem().
   - After: Only one call to page_pool_napi_local().

3) napi_direct == true && !BPF_RI_F_RF_NO_DIRECT:
   - Before: 2 comparisons in __xdp_return().
   - After: One call to page_pool_napi_local().

Cases 1 & 2 are the slower paths and can only gain, but they are slow anyway
so the gain is small. Case 3 is the fast path and is the one that has to be
considered more closely: the 2 comparisons from __xdp_return() are swapped for
the more expensive page_pool_napi_local() call.

The page_pool benchmark was used to compare the fast path with the newly added
NAPI-aware mode and measure [2] how expensive page_pool_napi_local() is:

bench_page_pool: time_bench_page_pool01_fast_path(): in_serving_softirq fast-path
bench_page_pool: Type:tasklet_page_pool01_fast_path Per elem: 15 cycles(tsc) 7.537 ns (step:0)
bench_page_pool: time_bench_page_pool04_napi_aware(): in_serving_softirq fast-path
bench_page_pool: Type:tasklet_page_pool04_napi_aware Per elem: 20 cycles(tsc) 10.490 ns (step:0)

... and the slow path for reference:

bench_page_pool: time_bench_page_pool02_ptr_ring(): in_serving_softirq fast-path
bench_page_pool: Type:tasklet_page_pool02_ptr_ring Per elem: 30 cycles(tsc) 15.395 ns (step:0)

So the impact is small in the fast path, but not negligible. One thing to
consider is that the comparisons from napi_direct are dropped, which means the
real impact will be smaller than the benchmark measurements suggest.

[1] Commit 2b986b9 ("bpf, cpumap: Disable page_pool direct xdp_return need larger scope")
[2] Intel Xeon Platinum 8580

Signed-off-by: Dragos Tatulea <[email protected]>
1 parent 6e47934 commit ca93a67
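The "internal mechanism" referenced in the commit message is the NAPI-awareness check on the page_pool release path, which can upgrade a non-direct return to a direct one when that is provably safe. Below is a simplified sketch of that logic, paraphrased from net/core/page_pool.c; the names match upstream, but this is an illustration rather than the verbatim implementation:

/* Paraphrased sketch of the page_pool release path (net/core/page_pool.c);
 * simplified, not the exact upstream code.
 *
 * Direct (lockless) recycling into the pool's per-NAPI cache is only safe
 * when the caller runs in the softirq/NAPI context that owns the pool.
 */
static bool page_pool_napi_local(const struct page_pool *pool)
{
	const struct napi_struct *napi;
	u32 cpuid;

	/* Never allow direct recycling outside of softirq context. */
	if (unlikely(!in_softirq()))
		return false;

	cpuid = smp_processor_id();

	/* Per-CPU pools (e.g. cpumap) record their owning CPU. */
	if (READ_ONCE(pool->cpuid) == cpuid)
		return true;

	/* Otherwise, allow it only if the pool's NAPI instance is
	 * currently scheduled on this CPU.
	 */
	napi = READ_ONCE(pool->p.napi);
	return napi && READ_ONCE(napi->list_owner) == cpuid;
}

/* The release entry point upgrades allow_direct when safe, which is why
 * __xdp_return() can now always pass false:
 */
void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem,
				  unsigned int dma_sync_size, bool allow_direct)
{
	if (!allow_direct)
		allow_direct = page_pool_napi_local(pool);

	/* ... normal per-netmem recycling continues from here ... */
}

This check is what case 3 in the commit message trades its two flag comparisons for; the 15 vs. 20 cycles-per-element benchmark numbers quantify exactly this cost.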

7 files changed: +14 -42 lines changed


drivers/net/veth.c

Lines changed: 0 additions & 2 deletions
@@ -975,7 +975,6 @@ static int veth_poll(struct napi_struct *napi, int budget)
 
 	bq.count = 0;
 
-	xdp_set_return_frame_no_direct();
 	done = veth_xdp_rcv(rq, budget, &bq, &stats);
 
 	if (stats.xdp_redirect > 0)
@@ -994,7 +993,6 @@ static int veth_poll(struct napi_struct *napi, int budget)
 
 	if (stats.xdp_tx > 0)
 		veth_xdp_flush(rq, &bq);
-	xdp_clear_return_frame_no_direct();
 
 	return done;
 }

include/linux/filter.h

Lines changed: 0 additions & 22 deletions
@@ -764,7 +764,6 @@ struct bpf_nh_params {
 };
 
 /* flags for bpf_redirect_info kern_flags */
-#define BPF_RI_F_RF_NO_DIRECT	BIT(0)	/* no napi_direct on return_frame */
 #define BPF_RI_F_RI_INIT	BIT(1)
 #define BPF_RI_F_CPU_MAP_INIT	BIT(2)
 #define BPF_RI_F_DEV_MAP_INIT	BIT(3)
@@ -1163,27 +1162,6 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 				       const struct bpf_insn *patch, u32 len);
 int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt);
 
-static inline bool xdp_return_frame_no_direct(void)
-{
-	struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
-
-	return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT;
-}
-
-static inline void xdp_set_return_frame_no_direct(void)
-{
-	struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
-
-	ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT;
-}
-
-static inline void xdp_clear_return_frame_no_direct(void)
-{
-	struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
-
-	ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT;
-}
-
 static inline int xdp_ok_fwd_dev(const struct net_device *fwd,
 				 unsigned int pktlen)
 {

include/net/xdp.h

Lines changed: 1 addition & 1 deletion
@@ -446,7 +446,7 @@ struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp)
 }
 
 void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type,
-		  bool napi_direct, struct xdp_buff *xdp);
+		  struct xdp_buff *xdp);
 void xdp_return_frame(struct xdp_frame *xdpf);
 void xdp_return_frame_rx_napi(struct xdp_frame *xdpf);
 void xdp_return_buff(struct xdp_buff *xdp);

kernel/bpf/cpumap.c

Lines changed: 0 additions & 2 deletions
@@ -253,7 +253,6 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
 
 	rcu_read_lock();
 	bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
-	xdp_set_return_frame_no_direct();
 
 	ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats);
 	if (unlikely(ret->skb_n))
@@ -263,7 +262,6 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
 	if (stats->redirect)
 		xdp_do_flush();
 
-	xdp_clear_return_frame_no_direct();
 	bpf_net_ctx_clear(bpf_net_ctx);
 	rcu_read_unlock();
 
net/bpf/test_run.c

Lines changed: 0 additions & 2 deletions
@@ -289,7 +289,6 @@ static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog,
 	local_bh_disable();
 	bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
 	ri = bpf_net_ctx_get_ri();
-	xdp_set_return_frame_no_direct();
 
 	for (i = 0; i < batch_sz; i++) {
 		page = page_pool_dev_alloc_pages(xdp->pp);
@@ -352,7 +351,6 @@ static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog,
 			err = ret;
 	}
 
-	xdp_clear_return_frame_no_direct();
 	bpf_net_ctx_clear(bpf_net_ctx);
 	local_bh_enable();
 	return err;

net/core/filter.c

Lines changed: 1 addition & 1 deletion
@@ -4195,7 +4195,7 @@ static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag,
 	}
 
 	if (release) {
-		__xdp_return(netmem, mem_type, false, zc_frag);
+		__xdp_return(netmem, mem_type, zc_frag);
 	} else {
 		if (!tail)
 			skb_frag_off_add(frag, shrink);

net/core/xdp.c

Lines changed: 12 additions & 12 deletions
@@ -431,18 +431,18 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_attach_page_pool);
  * of xdp_frames/pages in those cases.
  */
 void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type,
-		  bool napi_direct, struct xdp_buff *xdp)
+		  struct xdp_buff *xdp)
 {
 	switch (mem_type) {
 	case MEM_TYPE_PAGE_POOL:
 		netmem = netmem_compound_head(netmem);
-		if (napi_direct && xdp_return_frame_no_direct())
-			napi_direct = false;
+
 		/* No need to check netmem_is_pp() as mem->type knows this a
 		 * page_pool page
+		 *
+		 * page_pool can detect direct recycle.
 		 */
-		page_pool_put_full_netmem(netmem_get_pp(netmem), netmem,
-					  napi_direct);
+		page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false);
 		break;
 	case MEM_TYPE_PAGE_SHARED:
 		page_frag_free(__netmem_address(netmem));
@@ -471,10 +471,10 @@ void xdp_return_frame(struct xdp_frame *xdpf)
 	sinfo = xdp_get_shared_info_from_frame(xdpf);
 	for (u32 i = 0; i < sinfo->nr_frags; i++)
 		__xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type,
-			     false, NULL);
+			     NULL);
 
 out:
-	__xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, false, NULL);
+	__xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, NULL);
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame);
 
@@ -488,10 +488,10 @@ void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
 	sinfo = xdp_get_shared_info_from_frame(xdpf);
 	for (u32 i = 0; i < sinfo->nr_frags; i++)
 		__xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type,
-			     true, NULL);
+			     NULL);
 
 out:
-	__xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, true, NULL);
+	__xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, NULL);
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
 
@@ -542,7 +542,7 @@ EXPORT_SYMBOL_GPL(xdp_return_frame_bulk);
  */
 void xdp_return_frag(netmem_ref netmem, const struct xdp_buff *xdp)
 {
-	__xdp_return(netmem, xdp->rxq->mem.type, true, NULL);
+	__xdp_return(netmem, xdp->rxq->mem.type, NULL);
 }
 EXPORT_SYMBOL_GPL(xdp_return_frag);
 
@@ -556,10 +556,10 @@ void xdp_return_buff(struct xdp_buff *xdp)
 	sinfo = xdp_get_shared_info_from_buff(xdp);
 	for (u32 i = 0; i < sinfo->nr_frags; i++)
 		__xdp_return(skb_frag_netmem(&sinfo->frags[i]),
-			     xdp->rxq->mem.type, true, xdp);
+			     xdp->rxq->mem.type, xdp);
 
 out:
-	__xdp_return(virt_to_netmem(xdp->data), xdp->rxq->mem.type, true, xdp);
+	__xdp_return(virt_to_netmem(xdp->data), xdp->rxq->mem.type, xdp);
 }
 EXPORT_SYMBOL_GPL(xdp_return_buff);
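One related usage note, not part of this patch: page_pool_napi_local() can only upgrade the now always-false direct flag if the pool knows which NAPI instance (or CPU, for per-CPU pools) it belongs to. A minimal, hypothetical sketch of how a driver binds its pool to its RX NAPI at creation time follows; struct example_rq and its members are illustrative only, while the page_pool_params fields and page_pool_create() follow the in-tree API:

#include <linux/err.h>
#include <linux/netdevice.h>
#include <linux/numa.h>
#include <net/page_pool/types.h>

/* Hypothetical driver RX queue; names are illustrative, not from any driver. */
struct example_rq {
	struct device		*dev;
	struct napi_struct	napi;
	struct page_pool	*page_pool;
};

static int example_rq_create_page_pool(struct example_rq *rq)
{
	struct page_pool_params pp_params = { 0 };

	pp_params.order		= 0;
	pp_params.pool_size	= 256;
	pp_params.nid		= NUMA_NO_NODE;
	pp_params.dev		= rq->dev;
	/* Binding the pool to the RX NAPI is what lets the page_pool
	 * detect safe direct recycling on its own.
	 */
	pp_params.napi		= &rq->napi;

	rq->page_pool = page_pool_create(&pp_params);
	return PTR_ERR_OR_ZERO(rq->page_pool);
}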
