
Commit 0892b84

Reapply "net: skb: introduce and use a single page frag cache"
This reverts commit 011b033.

Sabrina reports that the revert may trigger warnings due to intervening changes, especially the ability to raise MAX_SKB_FRAGS. Let's drop it and revisit once that part is also ironed out.

Fixes: 011b033 ("Revert "net: skb: introduce and use a single page frag cache"")
Reported-by: Sabrina Dubroca <[email protected]>
Link: https://lore.kernel.org/6bf54579233038bc0e76056c5ea459872ce362ab.1739375933.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <[email protected]>
1 parent 0d0b752

File tree

3 files changed: +99 −22 lines changed

include/linux/netdevice.h

Lines changed: 1 addition & 0 deletions
@@ -4115,6 +4115,7 @@ void netif_receive_skb_list(struct list_head *head);
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
 void napi_gro_flush(struct napi_struct *napi, bool flush_old);
 struct sk_buff *napi_get_frags(struct napi_struct *napi);
+void napi_get_frags_check(struct napi_struct *napi);
 gro_result_t napi_gro_frags(struct napi_struct *napi);
 
 static inline void napi_free_frags(struct napi_struct *napi)

net/core/dev.c

Lines changed: 0 additions & 17 deletions
@@ -6920,23 +6920,6 @@ netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi)
 	list_add_rcu(&napi->dev_list, higher); /* adds after higher */
 }
 
-/* Double check that napi_get_frags() allocates skbs with
- * skb->head being backed by slab, not a page fragment.
- * This is to make sure bug fixed in 3226b158e67c
- * ("net: avoid 32 x truesize under-estimation for tiny skbs")
- * does not accidentally come back.
- */
-static void napi_get_frags_check(struct napi_struct *napi)
-{
-	struct sk_buff *skb;
-
-	local_bh_disable();
-	skb = napi_get_frags(napi);
-	WARN_ON_ONCE(skb && skb->head_frag);
-	napi_free_frags(napi);
-	local_bh_enable();
-}
-
 void netif_napi_add_weight_locked(struct net_device *dev,
 				  struct napi_struct *napi,
 				  int (*poll)(struct napi_struct *, int),

net/core/skbuff.c

Lines changed: 98 additions & 5 deletions
@@ -220,9 +220,67 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
 #define NAPI_SKB_CACHE_BULK	16
 #define NAPI_SKB_CACHE_HALF	(NAPI_SKB_CACHE_SIZE / 2)
 
+#if PAGE_SIZE == SZ_4K
+
+#define NAPI_HAS_SMALL_PAGE_FRAG	1
+#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	((nc).pfmemalloc)
+
+/* specialized page frag allocator using a single order 0 page
+ * and slicing it into 1K sized fragment. Constrained to systems
+ * with a very limited amount of 1K fragments fitting a single
+ * page - to avoid excessive truesize underestimation
+ */
+
+struct page_frag_1k {
+	void *va;
+	u16 offset;
+	bool pfmemalloc;
+};
+
+static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
+{
+	struct page *page;
+	int offset;
+
+	offset = nc->offset - SZ_1K;
+	if (likely(offset >= 0))
+		goto use_frag;
+
+	page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+	if (!page)
+		return NULL;
+
+	nc->va = page_address(page);
+	nc->pfmemalloc = page_is_pfmemalloc(page);
+	offset = PAGE_SIZE - SZ_1K;
+	page_ref_add(page, offset / SZ_1K);
+
+use_frag:
+	nc->offset = offset;
+	return nc->va + offset;
+}
+#else
+
+/* the small page is actually unused in this build; add dummy helpers
+ * to please the compiler and avoid later preprocessor's conditionals
+ */
+#define NAPI_HAS_SMALL_PAGE_FRAG	0
+#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	false
+
+struct page_frag_1k {
+};
+
+static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
+{
+	return NULL;
+}
+
+#endif
+
 struct napi_alloc_cache {
 	local_lock_t bh_lock;
 	struct page_frag_cache page;
+	struct page_frag_1k page_small;
 	unsigned int skb_count;
 	void *skb_cache[NAPI_SKB_CACHE_SIZE];
 };
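Editorial note: page_frag_alloc_1k() above slices one order-0 page into PAGE_SIZE / SZ_1K fragments, handing them out from the highest offset downward, and takes one page reference per fragment up front (the reference from alloc_pages_node() plus page_ref_add(page, offset / SZ_1K)). The following is a minimal userspace simulation of that arithmetic, not kernel code; the names (frag_1k_sim and friends) are invented for illustration.

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SZ 4096
#define FRAG_SZ 1024

struct frag_1k_sim {
	unsigned char *va; /* stands in for page_address(page) */
	int offset;        /* next fragment is carved at offset - FRAG_SZ */
	int refs;          /* stands in for the struct page refcount */
};

static void *frag_alloc_1k_sim(struct frag_1k_sim *nc)
{
	int offset = nc->offset - FRAG_SZ;

	if (offset < 0) {
		/* current page exhausted: grab a fresh one (leaked here;
		 * the kernel frees it once every fragment ref is dropped)
		 */
		nc->va = malloc(PAGE_SZ);
		if (!nc->va)
			return NULL;
		offset = PAGE_SZ - FRAG_SZ;
		/* one ref from the allocation plus one per remaining
		 * fragment, mirroring page_ref_add(page, offset / SZ_1K)
		 */
		nc->refs = 1 + offset / FRAG_SZ;
	}
	nc->offset = offset;
	return nc->va + offset;
}

int main(void)
{
	struct frag_1k_sim nc = { .offset = 0 };

	/* five allocations: offsets 3072, 2048, 1024, 0, then a new page */
	for (int i = 0; i < 5; i++) {
		frag_alloc_1k_sim(&nc);
		printf("frag %d at offset %d (page refs %d)\n",
		       i, nc.offset, nc.refs);
	}
	return 0;
}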
@@ -232,6 +290,23 @@ static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = {
 	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
 };
 
+/* Double check that napi_get_frags() allocates skbs with
+ * skb->head being backed by slab, not a page fragment.
+ * This is to make sure bug fixed in 3226b158e67c
+ * ("net: avoid 32 x truesize under-estimation for tiny skbs")
+ * does not accidentally come back.
+ */
+void napi_get_frags_check(struct napi_struct *napi)
+{
+	struct sk_buff *skb;
+
+	local_bh_disable();
+	skb = napi_get_frags(napi);
+	WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
+	napi_free_frags(napi);
+	local_bh_enable();
+}
+
 void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
 {
 	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
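For context on the guard above, a rough back-of-the-envelope for the "32 x" figure in 3226b158e67c, assuming the per-CPU page_frag_cache refills from order-3 (32KB) compound pages, which is the common configuration:

#include <stdio.h>

int main(void)
{
	unsigned int frag_cache_page = 32 * 1024; /* order-3 refill page */
	unsigned int accounted = 1024;            /* ~truesize of a tiny skb */

	/* a single tiny skb can pin the entire 32KB page while its
	 * truesize only accounts for roughly 1KB of it
	 */
	printf("worst-case under-estimation: %ux\n",
	       frag_cache_page / accounted);
	return 0;
}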
@@ -738,8 +813,10 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
 
 	/* If requested length is either too small or too big,
 	 * we use kmalloc() for skb->head allocation.
+	 * When the small frag allocator is available, prefer it over kmalloc
+	 * for small fragments
 	 */
-	if (len <= SKB_WITH_OVERHEAD(1024) ||
+	if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
 	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
 	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
 		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
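The 1024 in the test above is a gross buffer size; SKB_WITH_OVERHEAD() converts it to usable head room by subtracting the cacheline-aligned struct skb_shared_info. A minimal sketch of that arithmetic, assuming an x86_64 build with 64-byte cachelines and a 320-byte skb_shared_info (both values vary with config, e.g. with MAX_SKB_FRAGS):

#include <stdio.h>

#define SMP_CACHE_BYTES 64  /* assumption: 64B cachelines */
#define SHINFO_SIZE     320 /* assumption: sizeof(struct skb_shared_info) */
#define SKB_DATA_ALIGN(x) (((x) + SMP_CACHE_BYTES - 1) & ~(SMP_CACHE_BYTES - 1))
#define SKB_WITH_OVERHEAD(x) ((x) - SKB_DATA_ALIGN(SHINFO_SIZE))

int main(void)
{
	/* requests up to 704 usable bytes fall in the "1K" class */
	printf("SKB_WITH_OVERHEAD(1024) = %d\n", SKB_WITH_OVERHEAD(1024));
	/* anything above 3776 falls back to kmalloc via __alloc_skb() */
	printf("SKB_WITH_OVERHEAD(4096) = %d\n", SKB_WITH_OVERHEAD(4096));
	return 0;
}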
@@ -749,16 +826,32 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
 		goto skb_success;
 	}
 
-	len = SKB_HEAD_ALIGN(len);
-
 	if (sk_memalloc_socks())
 		gfp_mask |= __GFP_MEMALLOC;
 
 	local_lock_nested_bh(&napi_alloc_cache.bh_lock);
 	nc = this_cpu_ptr(&napi_alloc_cache);
+	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
+		/* we are artificially inflating the allocation size, but
+		 * that is not as bad as it may look like, as:
+		 * - 'len' less than GRO_MAX_HEAD makes little sense
+		 * - On most systems, larger 'len' values lead to fragment
+		 *   size above 512 bytes
+		 * - kmalloc would use the kmalloc-1k slab for such values
+		 * - Builds with smaller GRO_MAX_HEAD will very likely do
+		 *   little networking, as that implies no WiFi and no
+		 *   tunnels support, and 32 bits arches.
+		 */
+		len = SZ_1K;
 
-	data = page_frag_alloc(&nc->page, len, gfp_mask);
-	pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
+		data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
+		pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
+	} else {
+		len = SKB_HEAD_ALIGN(len);
+
+		data = page_frag_alloc(&nc->page, len, gfp_mask);
+		pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
+	}
 	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
 
 	if (unlikely(!data))
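Putting the pieces together, the rewritten napi_alloc_skb() now has three head-allocation paths. A hedged sketch of the selection logic on a PAGE_SIZE == 4K build, reusing the assumed threshold constants from the previous sketch and ignoring the gfp escape hatch (__GFP_DIRECT_RECLAIM | GFP_DMA also forces the kmalloc path); head_alloc_path() is an invented helper name:

#include <stdio.h>

#define SMP_CACHE_BYTES 64  /* assumption, as in the previous sketch */
#define SHINFO_SIZE     320 /* assumption, as in the previous sketch */
#define PAGE_SZ         4096
#define SKB_DATA_ALIGN(x) (((x) + SMP_CACHE_BYTES - 1) & ~(SMP_CACHE_BYTES - 1))
#define SKB_WITH_OVERHEAD(x) ((x) - SKB_DATA_ALIGN(SHINFO_SIZE))

static const char *head_alloc_path(unsigned int len)
{
	if (len > SKB_WITH_OVERHEAD(PAGE_SZ))
		return "__alloc_skb() (kmalloc head)";
	if (len <= SKB_WITH_OVERHEAD(1024))
		return "page_frag_alloc_1k() (1K slice of an order-0 page)";
	return "page_frag_alloc() (per-CPU page frag cache)";
}

int main(void)
{
	unsigned int lens[] = { 128, 704, 705, 3776, 3777 };

	for (unsigned int i = 0; i < sizeof(lens) / sizeof(lens[0]); i++)
		printf("len %4u -> %s\n", lens[i], head_alloc_path(lens[i]));
	return 0;
}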
