@@ -220,9 +220,67 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
 #define NAPI_SKB_CACHE_BULK	16
 #define NAPI_SKB_CACHE_HALF	(NAPI_SKB_CACHE_SIZE / 2)
 
+#if PAGE_SIZE == SZ_4K
+
+#define NAPI_HAS_SMALL_PAGE_FRAG	1
+#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	((nc).pfmemalloc)
+
+/* specialized page frag allocator using a single order 0 page
+ * and slicing it into 1K sized fragments. Constrained to systems
+ * with a very limited amount of 1K fragments fitting a single
+ * page - to avoid excessive truesize underestimation
+ */
+
+struct page_frag_1k {
+	void *va;
+	u16 offset;
+	bool pfmemalloc;
+};
+
+static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
+{
+	struct page *page;
+	int offset;
+
+	offset = nc->offset - SZ_1K;
+	if (likely(offset >= 0))
+		goto use_frag;
+
+	page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+	if (!page)
+		return NULL;
+
+	nc->va = page_address(page);
+	nc->pfmemalloc = page_is_pfmemalloc(page);
+	offset = PAGE_SIZE - SZ_1K;
+	page_ref_add(page, offset / SZ_1K);
+
+use_frag:
+	nc->offset = offset;
+	return nc->va + offset;
+}
+#else
+
+/* the small page is actually unused in this build; add dummy helpers
+ * to please the compiler and avoid preprocessor conditionals later on
+ */
+#define NAPI_HAS_SMALL_PAGE_FRAG	0
+#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	false
+
+struct page_frag_1k {
+};
+
+static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
+{
+	return NULL;
+}
+
+#endif
+
 struct napi_alloc_cache {
 	local_lock_t bh_lock;
 	struct page_frag_cache page;
+	struct page_frag_1k page_small;
 	unsigned int skb_count;
 	void *skb_cache[NAPI_SKB_CACHE_SIZE];
 };
@@ -232,6 +290,23 @@ static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = {
 	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
 };
 
+/* Double check that napi_get_frags() allocates skbs with
+ * skb->head being backed by slab, not a page fragment.
+ * This is to make sure the bug fixed in commit 3226b158e67c
+ * ("net: avoid 32 x truesize under-estimation for tiny skbs")
+ * does not accidentally come back.
+ */
+void napi_get_frags_check(struct napi_struct *napi)
+{
+	struct sk_buff *skb;
+
+	local_bh_disable();
+	skb = napi_get_frags(napi);
+	WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
+	napi_free_frags(napi);
+	local_bh_enable();
+}
+
 void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
 {
 	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
@@ -738,8 +813,10 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
 
 	/* If requested length is either too small or too big,
 	 * we use kmalloc() for skb->head allocation.
+	 * When the small frag allocator is available, prefer it over kmalloc
+	 * for small fragments
 	 */
-	if (len <= SKB_WITH_OVERHEAD(1024) ||
+	if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
 	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
 	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
 		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
@@ -749,16 +826,32 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
 		goto skb_success;
 	}
 
-	len = SKB_HEAD_ALIGN(len);
-
 	if (sk_memalloc_socks())
 		gfp_mask |= __GFP_MEMALLOC;
 
 	local_lock_nested_bh(&napi_alloc_cache.bh_lock);
 	nc = this_cpu_ptr(&napi_alloc_cache);
+	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
+		/* we are artificially inflating the allocation size, but
+		 * that is not as bad as it may look like, as:
+		 * - 'len' less than GRO_MAX_HEAD makes little sense
+		 * - On most systems, larger 'len' values lead to fragment
+		 *   size above 512 bytes
+		 * - kmalloc would use the kmalloc-1k slab for such values
+		 * - Builds with smaller GRO_MAX_HEAD will very likely do
+		 *   little networking, as that implies no WiFi and no
+		 *   tunnels support, and 32 bits arches.
+		 */
+		len = SZ_1K;
 
-	data = page_frag_alloc(&nc->page, len, gfp_mask);
-	pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
+		data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
+		pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
+	} else {
+		len = SKB_HEAD_ALIGN(len);
+
+		data = page_frag_alloc(&nc->page, len, gfp_mask);
+		pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
+	}
 	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
 
 	if (unlikely(!data))
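
To make the slicing and refcounting in page_frag_alloc_1k() above easier to follow, here is a minimal userspace sketch of the same idea: a 4 KiB page is handed out as 1 KiB fragments from the highest offset down, with one reference held per fragment so the backing page is released only when the last fragment is freed. This is an illustration only, not kernel code; the sim_* names are invented for the sketch, and malloc/free plus a plain counter stand in for alloc_pages_node(), page_ref_add() and the put_page() that happens when an skb head is released.

/*
 * Userspace sketch (assumed names, not from the patch) of the 1K-per-page
 * fragment scheme used by page_frag_alloc_1k() above.
 */
#include <stdio.h>
#include <stdlib.h>

#define SIM_PAGE_SIZE	4096
#define SIM_FRAG_SIZE	1024

struct sim_page {
	unsigned char data[SIM_PAGE_SIZE];
	int refcount;			/* one ref per outstanding 1K fragment */
};

struct sim_frag_1k {
	struct sim_page *page;		/* page currently being sliced */
	int offset;			/* offset of the last fragment handed out */
};

static void *sim_frag_alloc_1k(struct sim_frag_1k *nc)
{
	int offset = nc->offset - SIM_FRAG_SIZE;

	if (offset < 0) {
		/* no page yet, or the current one is exhausted: get a new one */
		struct sim_page *page = calloc(1, sizeof(*page));

		if (!page)
			return NULL;

		/* take one reference per fragment this page will provide */
		page->refcount = SIM_PAGE_SIZE / SIM_FRAG_SIZE;
		nc->page = page;
		offset = SIM_PAGE_SIZE - SIM_FRAG_SIZE;
	}

	nc->offset = offset;
	return nc->page->data + offset;
}

static void sim_frag_free(struct sim_page *page)
{
	/* dropping the last reference releases the whole page */
	if (--page->refcount == 0)
		free(page);
}

int main(void)
{
	struct sim_frag_1k nc = { .page = NULL, .offset = 0 };
	struct sim_page *pages[5];
	void *frags[5];
	int i;

	/* four fragments fill the first page; the fifth rolls over to a new one */
	for (i = 0; i < 5; i++) {
		frags[i] = sim_frag_alloc_1k(&nc);
		pages[i] = nc.page;
		printf("frag %d: page %p offset %td\n", i, (void *)pages[i],
		       (unsigned char *)frags[i] - pages[i]->data);
	}

	for (i = 0; i < 5; i++)
		sim_frag_free(pages[i]);

	/* the second page keeps references for its three unused fragments,
	 * just as the per-CPU cache would until later allocations consume them
	 */
	return 0;
}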