Commit 24d9e8b

Merge tag 'slab-for-6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab
Pull slab updates from Vlastimil Babka:

 - A new layer for caching objects for allocation and free via percpu
   arrays called sheaves. The aim is to combine the good parts of SLAB
   (lower-overhead and simpler percpu caching, compared to SLUB) without
   the past issues with arrays for freeing remote NUMA node objects and
   their flushing. It also allows more efficient kfree_rcu(), and cheaper
   object preallocations for cases where the exact number of objects is
   unknown, but an upper bound is.

   Currently VMAs and maple nodes are using this new caching, with a plan
   to enable it for all caches and remove the complex SLUB fastpath based
   on cpu (partial) slabs and this_cpu_cmpxchg_double().
   (Vlastimil Babka, with Liam Howlett and Pedro Falcato for the maple
   tree changes)

 - Re-entrant kmalloc_nolock(), which allows opportunistic allocations
   from NMI and tracing/kprobe contexts. Building on prior page allocator
   and memcg changes, it will result in removing BPF-specific caches on
   top of slab (Alexei Starovoitov)

 - Various fixes and cleanups (Kuan-Wei Chiu, Matthew Wilcox, Suren
   Baghdasaryan, Ye Liu)

* tag 'slab-for-6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab: (40 commits)
  slab: Introduce kmalloc_nolock() and kfree_nolock().
  slab: Reuse first bit for OBJEXTS_ALLOC_FAIL
  slab: Make slub local_(try)lock more precise for LOCKDEP
  mm: Introduce alloc_frozen_pages_nolock()
  mm: Allow GFP_ACCOUNT to be used in alloc_pages_nolock().
  locking/local_lock: Introduce local_lock_is_locked().
  maple_tree: Convert forking to use the sheaf interface
  maple_tree: Add single node allocation support to maple state
  maple_tree: Prefilled sheaf conversion and testing
  tools/testing: Add support for prefilled slab sheafs
  maple_tree: Replace mt_free_one() with kfree()
  maple_tree: Use kfree_rcu in ma_free_rcu
  testing/radix-tree/maple: Hack around kfree_rcu not existing
  tools/testing: include maple-shim.c in maple.c
  maple_tree: use percpu sheaves for maple_node_cache
  mm, vma: use percpu sheaves for vm_area_struct cache
  tools/testing: Add support for changes to slab for sheaves
  slab: allow NUMA restricted allocations to use percpu sheaves
  tools/testing/vma: Implement vm_refcnt reset
  slab: skip percpu sheaves for remote object freeing
  ...
2 parents: 07fdad3 + ca74b8c
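As a rough illustration of the sheaves feature described in the pull message, a cache opts in by setting the new sheaf_capacity field of struct kmem_cache_args (added in the include/linux/slab.h hunk below). The sketch assumes the args-based cache creation interface (__kmem_cache_create_args() and its kmem_cache_create() wrapper); the cache name, object type, and capacity value are purely illustrative.

#include <linux/slab.h>

/* Illustrative object type and cache; not part of this merge. */
struct my_object {
	unsigned long key;
	void *payload;
};

static struct kmem_cache *my_cache;

static int __init my_cache_init(void)
{
	/* Non-zero sheaf_capacity enables percpu sheaf caching for the cache. */
	struct kmem_cache_args args = {
		.sheaf_capacity = 32,	/* illustrative capacity */
	};

	my_cache = kmem_cache_create("my_object_cache",
				     sizeof(struct my_object), &args, 0);
	return my_cache ? 0 : -ENOMEM;
}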

File tree

27 files changed: +4462 / -3145 lines

include/linux/gfp.h

Lines changed: 1 addition & 1 deletion
@@ -354,7 +354,7 @@ static inline struct page *alloc_page_vma_noprof(gfp_t gfp,
 }
 #define alloc_page_vma(...)	alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__))
 
-struct page *alloc_pages_nolock_noprof(int nid, unsigned int order);
+struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order);
 #define alloc_pages_nolock(...)	alloc_hooks(alloc_pages_nolock_noprof(__VA_ARGS__))
 
 extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order);
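A minimal sketch of a caller under the new signature: the gfp argument is the new first parameter, and the kernel/bpf changes further down in this commit pass either 0 or __GFP_ACCOUNT there. The surrounding function is illustrative, not part of this diff.

#include <linux/gfp.h>

/* Illustrative caller; may run in a context that cannot take locks. */
static struct page *grab_page_accounted(int nid)
{
	/* New first argument: 0 for no accounting, __GFP_ACCOUNT to charge memcg. */
	return alloc_pages_nolock(__GFP_ACCOUNT, nid, 0);
}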

include/linux/kasan.h

Lines changed: 8 additions & 5 deletions
@@ -200,7 +200,7 @@ static __always_inline bool kasan_slab_pre_free(struct kmem_cache *s,
 }
 
 bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init,
-		       bool still_accessible);
+		       bool still_accessible, bool no_quarantine);
 /**
  * kasan_slab_free - Poison, initialize, and quarantine a slab object.
  * @object: Object to be freed.
@@ -226,11 +226,13 @@ bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init,
  * @Return true if KASAN took ownership of the object; false otherwise.
  */
 static __always_inline bool kasan_slab_free(struct kmem_cache *s,
-					    void *object, bool init,
-					    bool still_accessible)
+					    void *object, bool init,
+					    bool still_accessible,
+					    bool no_quarantine)
 {
 	if (kasan_enabled())
-		return __kasan_slab_free(s, object, init, still_accessible);
+		return __kasan_slab_free(s, object, init, still_accessible,
+					 no_quarantine);
 	return false;
 }
 
@@ -427,7 +429,8 @@ static inline bool kasan_slab_pre_free(struct kmem_cache *s, void *object)
 }
 
 static inline bool kasan_slab_free(struct kmem_cache *s, void *object,
-				   bool init, bool still_accessible)
+				   bool init, bool still_accessible,
+				   bool no_quarantine)
 {
 	return false;
 }
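A hedged sketch of how the extra parameter might be threaded through a slab-internal free path: passing no_quarantine as true lets a lock-free path (such as kfree_nolock()) ask KASAN not to quarantine the object, since quarantine handling may take locks. The helper and its callers are hypothetical, not part of this diff.

#include <linux/kasan.h>

/* Hypothetical slab-internal helper; names and callers are illustrative. */
static bool my_poison_on_free(struct kmem_cache *s, void *object,
			      bool init, bool lockless)
{
	/*
	 * kasan_slab_free() returns true if KASAN took ownership of the
	 * object, in which case the caller must not free it now. On the
	 * lockless path we request no quarantine so the free can proceed
	 * without touching quarantine state.
	 */
	return kasan_slab_free(s, object, init,
			       false /* still_accessible */,
			       lockless /* no_quarantine */);
}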

include/linux/local_lock.h

Lines changed: 2 additions & 0 deletions
@@ -66,6 +66,8 @@
  */
 #define local_trylock(lock)	__local_trylock(this_cpu_ptr(lock))
 
+#define local_lock_is_locked(lock)	__local_lock_is_locked(lock)
+
 /**
  * local_trylock_irqsave - Try to acquire a per CPU local lock, save and disable
  *			   interrupts if acquired

include/linux/local_lock_internal.h

Lines changed: 13 additions & 3 deletions
@@ -17,7 +17,10 @@ typedef struct {
 
 /* local_trylock() and local_trylock_irqsave() only work with local_trylock_t */
 typedef struct {
-	local_lock_t	llock;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map	dep_map;
+	struct task_struct	*owner;
+#endif
 	u8		acquired;
 } local_trylock_t;
 
@@ -31,7 +34,7 @@ typedef struct {
 		.owner = NULL,
 
 # define LOCAL_TRYLOCK_DEBUG_INIT(lockname)		\
-	.llock = { LOCAL_LOCK_DEBUG_INIT((lockname).llock) },
+	LOCAL_LOCK_DEBUG_INIT(lockname)
 
 static inline void local_lock_acquire(local_lock_t *l)
 {
@@ -81,7 +84,7 @@ do {								\
 	local_lock_debug_init(lock);				\
 } while (0)
 
-#define __local_trylock_init(lock) __local_lock_init(lock.llock)
+#define __local_trylock_init(lock) __local_lock_init((local_lock_t *)lock)
 
 #define __spinlock_nested_bh_init(lock)				\
 do {								\
@@ -162,6 +165,9 @@ do {								\
 		!!tl;						\
 	})
 
+/* preemption or migration must be disabled before calling __local_lock_is_locked */
+#define __local_lock_is_locked(lock)	READ_ONCE(this_cpu_ptr(lock)->acquired)
+
 #define __local_lock_release(lock)				\
 	do {							\
 		local_trylock_t	*tl;				\
@@ -282,4 +288,8 @@ do {								\
 		__local_trylock(lock);				\
 	})
 
+/* migration must be disabled before calling __local_lock_is_locked */
+#define __local_lock_is_locked(__lock)				\
+	(rt_mutex_owner(&this_cpu_ptr(__lock)->lock) == current)
+
 #endif /* CONFIG_PREEMPT_RT */
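A small sketch of what the new predicate enables: code that may run in NMI or tracing context can check whether the current CPU already holds a local_trylock_t before attempting to take it, and fall back to a re-entrant path instead of deadlocking. The per-CPU lock, its INIT_LOCAL_TRYLOCK initializer, and the surrounding function are illustrative, and preemption or migration is assumed to already be disabled at the call site, as the comments in the hunk above require.

#include <linux/local_lock.h>
#include <linux/percpu.h>

/* Illustrative per-CPU state guarded by a trylock-capable local lock. */
static DEFINE_PER_CPU(local_trylock_t, my_pcpu_lock) =
	INIT_LOCAL_TRYLOCK(my_pcpu_lock);

static bool my_try_fast_path(void)
{
	/*
	 * If this CPU already holds the lock we must have interrupted the
	 * holder (e.g. from NMI); bail out to a slower, re-entrant path.
	 * Preemption/migration is assumed disabled here.
	 */
	if (local_lock_is_locked(&my_pcpu_lock))
		return false;

	if (!local_trylock(&my_pcpu_lock))
		return false;

	/* ... touch the per-CPU data ... */
	local_unlock(&my_pcpu_lock);
	return true;
}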

include/linux/maple_tree.h

Lines changed: 5 additions & 1 deletion
@@ -442,7 +442,9 @@ struct ma_state {
 	struct maple_enode *node;	/* The node containing this entry */
 	unsigned long min;		/* The minimum index of this node - implied pivot min */
 	unsigned long max;		/* The maximum index of this node - implied pivot max */
-	struct maple_alloc *alloc;	/* Allocated nodes for this operation */
+	struct slab_sheaf *sheaf;	/* Allocated nodes for this operation */
+	struct maple_node *alloc;	/* A single allocated node for fast path writes */
+	unsigned long node_request;	/* The number of nodes to allocate for this operation */
 	enum maple_status status;	/* The status of the state (active, start, none, etc) */
 	unsigned char depth;		/* depth of tree descent during write */
 	unsigned char offset;
@@ -490,7 +492,9 @@ struct ma_wr_state {
 		.status = ma_start,					\
 		.min = 0,						\
 		.max = ULONG_MAX,					\
+		.sheaf = NULL,						\
 		.alloc = NULL,						\
+		.node_request = 0,					\
 		.mas_flags = 0,						\
 		.store_type = wr_invalid,				\
 	}

include/linux/memcontrol.h

Lines changed: 10 additions & 2 deletions
@@ -341,17 +341,25 @@ enum page_memcg_data_flags {
 	__NR_MEMCG_DATA_FLAGS	= (1UL << 2),
 };
 
+#define __OBJEXTS_ALLOC_FAIL	MEMCG_DATA_OBJEXTS
 #define __FIRST_OBJEXT_FLAG	__NR_MEMCG_DATA_FLAGS
 
 #else /* CONFIG_MEMCG */
 
+#define __OBJEXTS_ALLOC_FAIL	(1UL << 0)
 #define __FIRST_OBJEXT_FLAG	(1UL << 0)
 
 #endif /* CONFIG_MEMCG */
 
 enum objext_flags {
-	/* slabobj_ext vector failed to allocate */
-	OBJEXTS_ALLOC_FAIL = __FIRST_OBJEXT_FLAG,
+	/*
+	 * Use bit 0 with zero other bits to signal that slabobj_ext vector
+	 * failed to allocate. The same bit 0 with valid upper bits means
+	 * MEMCG_DATA_OBJEXTS.
+	 */
+	OBJEXTS_ALLOC_FAIL	= __OBJEXTS_ALLOC_FAIL,
+	/* slabobj_ext vector allocated with kmalloc_nolock() */
+	OBJEXTS_NOSPIN_ALLOC	= __FIRST_OBJEXT_FLAG,
 	/* the next bit after the last actual flag */
 	__NR_OBJEXTS_FLAGS = (__FIRST_OBJEXT_FLAG << 1),
 };
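A short sketch of what the bit reuse means for a reader of an obj_exts word; the helper only restates the comment in the hunk above and is illustrative, not part of this diff.

#include <linux/memcontrol.h>

/*
 * Illustrative helper: under the new encoding, bit 0 with no valid pointer
 * bits above it means the slabobj_ext vector allocation failed, while bit 0
 * together with a valid pointer means MEMCG_DATA_OBJEXTS.
 */
static inline bool obj_exts_alloc_failed(unsigned long obj_exts)
{
	return obj_exts == OBJEXTS_ALLOC_FAIL;
}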

include/linux/rtmutex.h

Lines changed: 10 additions & 0 deletions
@@ -44,6 +44,16 @@ static inline bool rt_mutex_base_is_locked(struct rt_mutex_base *lock)
 	return READ_ONCE(lock->owner) != NULL;
 }
 
+#ifdef CONFIG_RT_MUTEXES
+#define RT_MUTEX_HAS_WAITERS		1UL
+
+static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock)
+{
+	unsigned long owner = (unsigned long) READ_ONCE(lock->owner);
+
+	return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS);
+}
+#endif
 extern void rt_mutex_base_init(struct rt_mutex_base *rtb);
 
 /**

include/linux/slab.h

Lines changed: 51 additions & 0 deletions
@@ -335,6 +335,37 @@ struct kmem_cache_args {
 	 * %NULL means no constructor.
 	 */
 	void (*ctor)(void *);
+	/**
+	 * @sheaf_capacity: Enable sheaves of given capacity for the cache.
+	 *
+	 * With a non-zero value, allocations from the cache go through caching
+	 * arrays called sheaves. Each cpu has a main sheaf that's always
+	 * present, and a spare sheaf that may be not present. When both become
+	 * empty, there's an attempt to replace an empty sheaf with a full sheaf
+	 * from the per-node barn.
+	 *
+	 * When no full sheaf is available, and gfp flags allow blocking, a
+	 * sheaf is allocated and filled from slab(s) using bulk allocation.
+	 * Otherwise the allocation falls back to the normal operation
+	 * allocating a single object from a slab.
+	 *
+	 * Analogically when freeing and both percpu sheaves are full, the barn
+	 * may replace it with an empty sheaf, unless it's over capacity. In
+	 * that case a sheaf is bulk freed to slab pages.
+	 *
+	 * The sheaves do not enforce NUMA placement of objects, so allocations
+	 * via kmem_cache_alloc_node() with a node specified other than
+	 * NUMA_NO_NODE will bypass them.
+	 *
+	 * Bulk allocation and free operations also try to use the cpu sheaves
+	 * and barn, but fallback to using slab pages directly.
+	 *
+	 * When slub_debug is enabled for the cache, the sheaf_capacity argument
+	 * is ignored.
+	 *
+	 * %0 means no sheaves will be created.
+	 */
+	unsigned int sheaf_capacity;
 };
 
 struct kmem_cache *__kmem_cache_create_args(const char *name,
@@ -470,6 +501,7 @@ void * __must_check krealloc_noprof(const void *objp, size_t new_size,
 #define krealloc(...)				alloc_hooks(krealloc_noprof(__VA_ARGS__))
 
 void kfree(const void *objp);
+void kfree_nolock(const void *objp);
 void kfree_sensitive(const void *objp);
 size_t __ksize(const void *objp);
 
@@ -798,6 +830,22 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t flags,
 			int node) __assume_slab_alignment __malloc;
 #define kmem_cache_alloc_node(...)	alloc_hooks(kmem_cache_alloc_node_noprof(__VA_ARGS__))
 
+struct slab_sheaf *
+kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size);
+
+int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp,
+		struct slab_sheaf **sheafp, unsigned int size);
+
+void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp,
+		struct slab_sheaf *sheaf);
+
+void *kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *cachep, gfp_t gfp,
+		struct slab_sheaf *sheaf) __assume_slab_alignment __malloc;
+#define kmem_cache_alloc_from_sheaf(...)	\
+			alloc_hooks(kmem_cache_alloc_from_sheaf_noprof(__VA_ARGS__))
+
+unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf);
+
 /*
  * These macros allow declaring a kmem_buckets * parameter alongside size, which
  * can be compiled out with CONFIG_SLAB_BUCKETS=n so that a large number of call
@@ -910,6 +958,9 @@ static __always_inline __alloc_size(1) void *kmalloc_noprof(size_t size, gfp_t f
 }
 #define kmalloc(...)				alloc_hooks(kmalloc_noprof(__VA_ARGS__))
 
+void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node);
+#define kmalloc_nolock(...)			alloc_hooks(kmalloc_nolock_noprof(__VA_ARGS__))
+
 #define kmem_buckets_alloc(_b, _size, _flags)	\
 	alloc_hooks(__kmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE))
 
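Two brief sketches of the interfaces declared above, using hypothetical caches and sizes: the first preallocates an upper bound of objects into a sheaf and then draws from it where blocking is not allowed, the second probes the lock-free kmalloc variant intended for NMI/tracing contexts. Error handling, gfp flag choices, and the return-value convention of kmem_cache_prefill_sheaf() are simplified assumptions here, not taken from this diff.

#include <linux/slab.h>
#include <linux/numa.h>

/* 'cache' and 'max_objects' are illustrative placeholders. */
static void *take_one_prefilled(struct kmem_cache *cache,
				unsigned int max_objects)
{
	struct slab_sheaf *sheaf;
	void *obj;

	/* May block; fills up to max_objects objects into the sheaf. */
	sheaf = kmem_cache_prefill_sheaf(cache, GFP_KERNEL, max_objects);
	if (!sheaf)
		return NULL;

	/* Later, possibly where blocking is not allowed, take one object. */
	obj = kmem_cache_alloc_from_sheaf(cache, GFP_NOWAIT, sheaf);

	/* Any unused preallocated objects go back with the sheaf. */
	kmem_cache_return_sheaf(cache, GFP_KERNEL, sheaf);

	return obj;
}

/* Opportunistic allocation from a context that cannot take locks. */
static bool probe_nolock_alloc(void)
{
	/* May fail where a regular kmalloc(GFP_ATOMIC) would succeed. */
	void *p = kmalloc_nolock(64, 0, NUMA_NO_NODE);

	if (!p)
		return false;

	kfree_nolock(p);	/* matching lock-free free path */
	return true;
}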
kernel/bpf/stream.c

Lines changed: 1 addition & 1 deletion
@@ -83,7 +83,7 @@ static struct bpf_stream_page *bpf_stream_page_replace(void)
 	struct bpf_stream_page *stream_page, *old_stream_page;
 	struct page *page;
 
-	page = alloc_pages_nolock(NUMA_NO_NODE, 0);
+	page = alloc_pages_nolock(/* Don't account */ 0, NUMA_NO_NODE, 0);
 	if (!page)
 		return NULL;
 	stream_page = page_address(page);

kernel/bpf/syscall.c

Lines changed: 1 addition & 1 deletion
@@ -583,7 +583,7 @@ static bool can_alloc_pages(void)
 static struct page *__bpf_alloc_page(int nid)
 {
 	if (!can_alloc_pages())
-		return alloc_pages_nolock(nid, 0);
+		return alloc_pages_nolock(__GFP_ACCOUNT, nid, 0);
 
 	return alloc_pages_node(nid,
 			GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT
