@@ -220,9 +220,67 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
 #define NAPI_SKB_CACHE_BULK	16
 #define NAPI_SKB_CACHE_HALF	(NAPI_SKB_CACHE_SIZE / 2)
 
+#if PAGE_SIZE == SZ_4K
+
+#define NAPI_HAS_SMALL_PAGE_FRAG	1
+#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	((nc).pfmemalloc)
+
+/* specialized page frag allocator using a single order 0 page
+ * and slicing it into 1K sized fragments. Constrained to systems
+ * with a very limited amount of 1K fragments fitting a single
+ * page - to avoid excessive truesize underestimation
+ */
+
+struct page_frag_1k {
+	void *va;
+	u16 offset;
+	bool pfmemalloc;
+};
+
+static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
+{
+	struct page *page;
+	int offset;
+
+	offset = nc->offset - SZ_1K;
+	if (likely(offset >= 0))
+		goto use_frag;
+
+	page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+	if (!page)
+		return NULL;
+
+	nc->va = page_address(page);
+	nc->pfmemalloc = page_is_pfmemalloc(page);
+	offset = PAGE_SIZE - SZ_1K;
+	page_ref_add(page, offset / SZ_1K);
+
+use_frag:
+	nc->offset = offset;
+	return nc->va + offset;
+}
+#else
+
+/* the small page is actually unused in this build; add dummy helpers
+ * to please the compiler and avoid later preprocessor conditionals
+ */
+#define NAPI_HAS_SMALL_PAGE_FRAG	0
+#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	false
+
+struct page_frag_1k {
+};
+
+static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
+{
+	return NULL;
+}
+
+#endif
+
 struct napi_alloc_cache {
 	local_lock_t bh_lock;
 	struct page_frag_cache page;
+	struct page_frag_1k page_small;
 	unsigned int skb_count;
 	void *skb_cache[NAPI_SKB_CACHE_SIZE];
 };
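To make the slicing scheme above concrete, here is a minimal userspace sketch (plain C, not kernel code): malloc() stands in for alloc_pages_node(), and the page refcount is reduced to a plain counter. Fragments are carved from the top of a 4K page downwards; a fresh page immediately takes one reference per 1K fragment, which is what page_ref_add(page, offset / SZ_1K) accomplishes in the real code.

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SZ	4096
#define FRAG_SZ	1024

struct frag_1k {
	char *va;	/* current backing "page"; leaked in this sketch -
			 * the kernel frees the real page once its refcount
			 * drops to zero */
	int offset;
	int refs;	/* stand-in for the page refcount */
};

static void *frag_alloc_1k(struct frag_1k *nc)
{
	int offset = nc->offset - FRAG_SZ;

	if (offset < 0) {
		/* current page exhausted: grab a fresh one */
		nc->va = malloc(PAGE_SZ);
		if (!nc->va)
			return NULL;
		offset = PAGE_SZ - FRAG_SZ;
		/* allocator's ref plus one per remaining fragment,
		 * i.e. one ref for each of the four 1K slices */
		nc->refs = 1 + offset / FRAG_SZ;
	}
	nc->offset = offset;
	return nc->va + offset;
}

int main(void)
{
	struct frag_1k nc = { .va = NULL, .offset = 0, .refs = 0 };

	for (int i = 0; i < 6; i++) {	/* six allocs span two "pages" */
		void *p = frag_alloc_1k(&nc);
		printf("frag %d -> %p (offset %4d, page refs %d)\n",
		       i, p, nc.offset, nc.refs);
	}
	return 0;
}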
@@ -232,6 +290,23 @@ static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = {
 	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
 };
 
+/* Double check that napi_get_frags() allocates skbs with
+ * skb->head being backed by slab, not a page fragment.
+ * This is to make sure bug fixed in 3226b158e67c
+ * ("net: avoid 32 x truesize under-estimation for tiny skbs")
+ * does not accidentally come back.
+ */
+void napi_get_frags_check(struct napi_struct *napi)
+{
+	struct sk_buff *skb;
+
+	local_bh_disable();
+	skb = napi_get_frags(napi);
+	WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
+	napi_free_frags(napi);
+	local_bh_enable();
+}
+
 void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
 {
 	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
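For context on the "32 x truesize" figure referenced above: the per-CPU page_frag_cache is normally backed by a 32K compound page (PAGE_FRAG_CACHE_MAX_SIZE), so before commit 3226b158e67c a tiny skb whose head was roughly a 1K slice of it could pin the whole 32K page while accounting only about 1K in skb->truesize. A trivial back-of-the-envelope check (the ~1K slice size is an assumption for illustration):

#include <stdio.h>

int main(void)
{
	const unsigned int frag_cache_page = 32 * 1024;	/* PAGE_FRAG_CACHE_MAX_SIZE */
	const unsigned int tiny_head_slice = 1024;	/* assumed ~1K head fragment */

	printf("worst-case truesize under-estimation: %ux\n",
	       frag_cache_page / tiny_head_slice);	/* prints 32x */
	return 0;
}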
@@ -738,8 +813,10 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
 
 	/* If requested length is either too small or too big,
 	 * we use kmalloc() for skb->head allocation.
+	 * When the small frag allocator is available, prefer it over kmalloc
+	 * for small fragments
 	 */
-	if (len <= SKB_WITH_OVERHEAD(1024) ||
+	if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
 	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
 	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
 		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
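The SKB_WITH_OVERHEAD() bounds in the test above can be made concrete. As a sketch, assuming a 4K page, 64-byte cachelines and a 320-byte struct skb_shared_info (both size values are illustrative; the real ones depend on arch and config):

#include <stdio.h>

#define SMP_CACHE_BYTES		64	/* assumed cacheline size */
#define ALIGN_UP(x, a)		(((x) + (a) - 1) & ~((a) - 1))
#define SKB_DATA_ALIGN(x)	ALIGN_UP(x, SMP_CACHE_BYTES)
#define SHINFO_SIZE		320	/* assumed sizeof(struct skb_shared_info) */
#define SKB_WITH_OVERHEAD(x)	((x) - SKB_DATA_ALIGN(SHINFO_SIZE))

int main(void)
{
	/* payload that still fits a 1K head alongside skb_shared_info */
	printf("small-skb threshold: %d bytes\n", SKB_WITH_OVERHEAD(1024));	/* 704 */
	/* largest payload the page-frag path will serve */
	printf("page-frag upper bound: %d bytes\n", SKB_WITH_OVERHEAD(4096));	/* 3776 */
	return 0;
}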
@@ -749,16 +826,32 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
 		goto skb_success;
 	}
 
-	len = SKB_HEAD_ALIGN(len);
-
 	if (sk_memalloc_socks())
 		gfp_mask |= __GFP_MEMALLOC;
 
 	local_lock_nested_bh(&napi_alloc_cache.bh_lock);
 	nc = this_cpu_ptr(&napi_alloc_cache);
+	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
+		/* we are artificially inflating the allocation size, but
+		 * that is not as bad as it may look, as:
+		 * - 'len' less than GRO_MAX_HEAD makes little sense
+		 * - On most systems, larger 'len' values lead to fragment
+		 *   size above 512 bytes
+		 * - kmalloc would use the kmalloc-1k slab for such values
+		 * - Builds with smaller GRO_MAX_HEAD will very likely do
+		 *   little networking, as that implies no WiFi and no
+		 *   tunnels support, and 32-bit arches.
+		 */
+		len = SZ_1K;
 
-	data = page_frag_alloc(&nc->page, len, gfp_mask);
-	pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
+		data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
+		pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
+	} else {
+		len = SKB_HEAD_ALIGN(len);
+
+		data = page_frag_alloc(&nc->page, len, gfp_mask);
+		pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
+	}
 	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
 
 	if (unlikely(!data))
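The payoff of routing small heads through page_frag_alloc_1k() is the bound on pinned memory: a 1K head fragment can now keep at most one order-0 page alive, instead of a 32K frag-cache page. A rough comparison of the two worst cases (ratios only; actual skb->truesize also counts struct sk_buff and skb_shared_info):

#include <stdio.h>

int main(void)
{
	const unsigned int old_backing = 32 * 1024;	/* 32K frag-cache page */
	const unsigned int new_backing = 4 * 1024;	/* one order-0 page */
	const unsigned int head_slice  = 1024;		/* fixed SZ_1K fragment */

	printf("worst-case pinned/accounted: %ux -> %ux\n",
	       old_backing / head_slice, new_backing / head_slice);	/* 32x -> 4x */
	return 0;
}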