
Commit cd97950

Merge tag 'slab-for-6.10' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab
Pull slab updates from Vlastimil Babka:
 "This time it's mostly random cleanups and fixes, with two performance fixes that might have significant impact, but limited to systems experiencing particularly bad corner-case scenarios rather than general performance improvements.

  The memcg hook changes are going through the mm tree due to dependencies.

  - Prevent stalls when reading /proc/slabinfo (Jianfeng Wang)

    This fixes a long-standing problem that can happen with workloads whose alloc/free patterns result in many partially used slabs (e.g. in the dentry cache). Reading /proc/slabinfo traverses the long partial slab list under a spinlock with irqs disabled and can thus stall other processes or even trigger the lockup detector. The traversal is only done to count free objects, so that the <active_objs> column can be reported alongside <num_objs>.

    To avoid affecting fast paths with another shared counter (attempted in the past) or complex partial-list traversal schemes that allow rescheduling, the chosen solution resorts to approximation: when the partial list is over 10000 slabs long, only the first 5000 slabs from the head and the last 5000 from the tail are traversed, and the average of both samples is used to estimate the whole list. Both head and tail are sampled because slabs near the head tend to have more free objects than those towards the tail.

    The approximation is not expected to break existing /proc/slabinfo consumers. The <num_objs> field is still accurate and reflects the overall kmem_cache footprint. The <active_objs> field was already imprecise due to cpu and percpu-partial slabs, so it can't be relied upon to determine exact cache usage. The difference between <active_objs> and <num_objs> is mainly useful for judging slab fragmentation, and that remains possible even with the approximation in place.

  - Prevent allocating many slabs when a NUMA node is full (Chen Jun)

    Currently, on NUMA systems with one node under significantly more pressure than the others, the fallback strategy may cause every kmalloc_node() that cannot be satisfied from the preferred node to allocate a new slab on a fallback node instead of reusing the slabs already on that node's partial list. This is now fixed, and the partial lists of fallback nodes are checked even for kmalloc_node() allocations. It is still preferred to allocate a new slab on the requested node before falling back, but only with a GFP_NOWAIT attempt, which fails quickly when the node is under significant memory pressure.

  - More SLAB removal related cleanups (Xiu Jianfeng, Hyunmin Lee)

  - Fix slub_kunit self-test with hardened freelists (Guenter Roeck)

  - Mark racy accesses for KCSAN (linke li)

  - Misc cleanups (Xiongwei Song, Haifeng Xu, Sangyun Kim)"

* tag 'slab-for-6.10' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab:
  mm/slub: remove the check for NULL kmalloc_caches
  mm/slub: create kmalloc 96 and 192 caches regardless cache size order
  mm/slub: mark racy access on slab->freelist
  slub: use count_partial_free_approx() in slab_out_of_memory()
  slub: introduce count_partial_free_approx()
  slub: Set __GFP_COMP in kmem_cache by default
  mm/slub: remove duplicate initialization for early_kmem_cache_node_alloc()
  mm/slub: correct comment in do_slab_free()
  mm/slub, kunit: Use inverted data to corrupt kmem cache
  mm/slub: simplify get_partial_node()
  mm/slub: add slub_get_cpu_partial() helper
  mm/slub: remove the check of !kmem_cache_has_cpu_partial()
  mm/slub: Reduce memory consumption in extreme scenarios
  mm/slub: mark racy accesses on slab->slabs
  mm/slub: remove dummy slabinfo functions
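As a rough illustration of the estimation described above (the real implementation is count_partial_free_approx() in the mm/slub.c hunk further down), here is a minimal userspace C sketch of the arithmetic only; the array, sizes and numbers are made up for illustration and none of this is kernel API:

#include <stdio.h>

/*
 * Sketch of the /proc/slabinfo approximation: when the partial list is
 * longer than max_scan slabs, sum the free objects of the first
 * max_scan/2 and the last max_scan/2 slabs, then scale the sample by
 * list_len / scanned.  The array stands in for the per-node partial
 * list that the kernel walks under n->list_lock.
 */
static unsigned long estimate_free(const unsigned int *free_per_slab,
				   unsigned long list_len,
				   unsigned long max_scan)
{
	unsigned long x = 0, scanned = 0, i;

	if (list_len <= max_scan) {
		for (i = 0; i < list_len; i++)
			x += free_per_slab[i];
		return x;
	}

	for (i = 0; i < max_scan / 2; i++, scanned++)	/* head sample */
		x += free_per_slab[i];
	for (i = 0; i < max_scan / 2; i++, scanned++)	/* tail sample */
		x += free_per_slab[list_len - 1 - i];

	return x * list_len / scanned;			/* scale up */
}

int main(void)
{
	/* hypothetical list of 8 partial slabs, scanning at most 4 of them */
	unsigned int free_per_slab[] = { 30, 28, 25, 20, 10, 6, 3, 1 };

	/* (30 + 28 + 3 + 1) * 8 / 4 = 124, versus the exact total of 123 */
	printf("estimated free objects: %lu\n",
	       estimate_free(free_per_slab, 8, 4));
	return 0;
}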
2 parents c07ea94 + 7338999 commit cd97950

4 files changed: 96 additions, 54 deletions


lib/slub_kunit.c

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@ static void test_next_pointer(struct kunit *test)
 
 	ptr_addr = (unsigned long *)(p + s->offset);
 	tmp = *ptr_addr;
-	p[s->offset] = 0x12;
+	p[s->offset] = ~p[s->offset];
 
 	/*
 	 * Expecting three errors.

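Why the inverted byte in the hunk above: with CONFIG_SLAB_FREELIST_HARDENED the stored freelist pointer is obfuscated, so the byte at s->offset can hold any value, including 0x12; writing a fixed constant can then fail to corrupt anything and the test sees fewer errors than expected, while writing ~p[s->offset] always changes the byte. A trivial standalone C sketch of that reasoning (illustration only, not kernel code):

#include <assert.h>
#include <stdint.h>

/* Treat the obfuscated byte at s->offset as an arbitrary value. */
static int constant_corrupts(uint8_t stored) { return (uint8_t)0x12 != stored; }
static int invert_corrupts(uint8_t stored)   { return (uint8_t)~stored != stored; }

int main(void)
{
	for (int b = 0; b < 256; b++) {
		assert(invert_corrupts((uint8_t)b));	/* always a mismatch */
		if (b == 0x12)
			assert(!constant_corrupts((uint8_t)b)); /* the 1-in-256 miss */
	}
	return 0;
}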
mm/slab.h

Lines changed: 0 additions & 3 deletions
@@ -496,9 +496,6 @@ struct slabinfo {
 };
 
 void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo);
-void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s);
-ssize_t slabinfo_write(struct file *file, const char __user *buffer,
-		       size_t count, loff_t *ppos);
 
 #ifdef CONFIG_SLUB_DEBUG
 #ifdef CONFIG_SLUB_DEBUG_ON

mm/slab_common.c

Lines changed: 9 additions & 18 deletions
@@ -916,22 +916,15 @@ void __init create_kmalloc_caches(void)
 	 * Including KMALLOC_CGROUP if CONFIG_MEMCG_KMEM defined
 	 */
 	for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) {
-		for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
-			if (!kmalloc_caches[type][i])
-				new_kmalloc_cache(i, type);
-
-			/*
-			 * Caches that are not of the two-to-the-power-of size.
-			 * These have to be created immediately after the
-			 * earlier power of two caches
-			 */
-			if (KMALLOC_MIN_SIZE <= 32 && i == 6 &&
-					!kmalloc_caches[type][1])
-				new_kmalloc_cache(1, type);
-			if (KMALLOC_MIN_SIZE <= 64 && i == 7 &&
-					!kmalloc_caches[type][2])
-				new_kmalloc_cache(2, type);
-		}
+		/* Caches that are NOT of the two-to-the-power-of size. */
+		if (KMALLOC_MIN_SIZE <= 32)
+			new_kmalloc_cache(1, type);
+		if (KMALLOC_MIN_SIZE <= 64)
+			new_kmalloc_cache(2, type);
+
+		/* Caches that are of the two-to-the-power-of size. */
+		for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
+			new_kmalloc_cache(i, type);
 	}
 #ifdef CONFIG_RANDOM_KMALLOC_CACHES
 	random_kmalloc_seed = get_random_u64();
@@ -1078,7 +1071,6 @@ static void cache_show(struct kmem_cache *s, struct seq_file *m)
 		   sinfo.limit, sinfo.batchcount, sinfo.shared);
 	seq_printf(m, " : slabdata %6lu %6lu %6lu",
 		   sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
-	slabinfo_show_stats(m, s);
 	seq_putc(m, '\n');
 }
 
@@ -1155,7 +1147,6 @@ static const struct proc_ops slabinfo_proc_ops = {
 	.proc_flags	= PROC_ENTRY_PERMANENT,
 	.proc_open	= slabinfo_open,
 	.proc_read	= seq_read,
-	.proc_write	= slabinfo_write,
 	.proc_lseek	= seq_lseek,
 	.proc_release	= seq_release,
 };

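The reordered loop above creates the two odd-sized caches, kmalloc-96 (index 1) and kmalloc-192 (index 2), unconditionally before the power-of-two caches. For context on where those indices come from, here is a simplified, hypothetical sketch of how an allocation size maps to a kmalloc cache index (this is not the kernel's __kmalloc_index(); it assumes KMALLOC_MIN_SIZE is 8 and stops at 4 KiB for brevity):

#include <stddef.h>
#include <stdio.h>

/* Indices 1 and 2 are the 96- and 192-byte caches; every other size
 * rounds up to a power of two whose log2 is the index. */
static unsigned int kmalloc_index_sketch(size_t size)
{
	unsigned int i;

	if (size > 64 && size <= 96)
		return 1;			/* kmalloc-96 */
	if (size > 128 && size <= 192)
		return 2;			/* kmalloc-192 */
	for (i = 3; i <= 12; i++)		/* 8 bytes .. 4 KiB */
		if (size <= (1UL << i))
			return i;
	return 0;				/* larger sizes handled elsewhere */
}

int main(void)
{
	size_t sizes[] = { 8, 72, 100, 192, 1024 };

	for (unsigned int j = 0; j < sizeof(sizes) / sizeof(sizes[0]); j++)
		printf("size %zu -> kmalloc index %u\n",
		       sizes[j], kmalloc_index_sketch(sizes[j]));
	return 0;
}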
mm/slub.c

Lines changed: 86 additions & 32 deletions
@@ -624,11 +624,21 @@ static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
 	nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo));
 	s->cpu_partial_slabs = nr_slabs;
 }
+
+static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
+{
+	return s->cpu_partial_slabs;
+}
 #else
 static inline void
 slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
 {
 }
+
+static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
+{
+	return 0;
+}
 #endif /* CONFIG_SLUB_CPU_PARTIAL */
 
 /*
@@ -2609,19 +2619,18 @@ static struct slab *get_partial_node(struct kmem_cache *s,
 		if (!partial) {
 			partial = slab;
 			stat(s, ALLOC_FROM_PARTIAL);
+
+			if ((slub_get_cpu_partial(s) == 0)) {
+				break;
+			}
 		} else {
 			put_cpu_partial(s, slab, 0);
 			stat(s, CPU_PARTIAL_NODE);
-			partial_slabs++;
-		}
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-		if (!kmem_cache_has_cpu_partial(s)
-			|| partial_slabs > s->cpu_partial_slabs / 2)
-			break;
-#else
-		break;
-#endif
 
+			if (++partial_slabs > slub_get_cpu_partial(s) / 2) {
+				break;
+			}
+		}
 	}
 	spin_unlock_irqrestore(&n->list_lock, flags);
 	return partial;
@@ -2704,7 +2713,7 @@ static struct slab *get_partial(struct kmem_cache *s, int node,
 		searchnode = numa_mem_id();
 
 	slab = get_partial_node(s, get_node(s, searchnode), pc);
-	if (slab || node != NUMA_NO_NODE)
+	if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE)))
 		return slab;
 
 	return get_any_partial(s, pc);
@@ -2802,7 +2811,7 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
 	struct slab new;
 	struct slab old;
 
-	if (slab->freelist) {
+	if (READ_ONCE(slab->freelist)) {
 		stat(s, DEACTIVATE_REMOTE_FREES);
 		tail = DEACTIVATE_TO_TAIL;
 	}
@@ -3234,6 +3243,43 @@ static unsigned long count_partial(struct kmem_cache_node *n,
 #endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */
 
 #ifdef CONFIG_SLUB_DEBUG
+#define MAX_PARTIAL_TO_SCAN 10000
+
+static unsigned long count_partial_free_approx(struct kmem_cache_node *n)
+{
+	unsigned long flags;
+	unsigned long x = 0;
+	struct slab *slab;
+
+	spin_lock_irqsave(&n->list_lock, flags);
+	if (n->nr_partial <= MAX_PARTIAL_TO_SCAN) {
+		list_for_each_entry(slab, &n->partial, slab_list)
+			x += slab->objects - slab->inuse;
+	} else {
+		/*
+		 * For a long list, approximate the total count of objects in
+		 * it to meet the limit on the number of slabs to scan.
+		 * Scan from both the list's head and tail for better accuracy.
+		 */
+		unsigned long scanned = 0;
+
+		list_for_each_entry(slab, &n->partial, slab_list) {
+			x += slab->objects - slab->inuse;
+			if (++scanned == MAX_PARTIAL_TO_SCAN / 2)
+				break;
+		}
+		list_for_each_entry_reverse(slab, &n->partial, slab_list) {
+			x += slab->objects - slab->inuse;
+			if (++scanned == MAX_PARTIAL_TO_SCAN)
+				break;
+		}
+		x = mult_frac(x, n->nr_partial, scanned);
+		x = min(x, node_nr_objs(n));
+	}
+	spin_unlock_irqrestore(&n->list_lock, flags);
+	return x;
+}
+
 static noinline void
 slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
 {
@@ -3260,7 +3306,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
 		unsigned long nr_objs;
 		unsigned long nr_free;
 
-		nr_free = count_partial(n, count_free);
+		nr_free = count_partial_free_approx(n);
 		nr_slabs = node_nr_slabs(n);
 		nr_objs = node_nr_objs(n);
 
@@ -3380,6 +3426,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	struct slab *slab;
 	unsigned long flags;
 	struct partial_context pc;
+	bool try_thisnode = true;
 
 	stat(s, ALLOC_SLOWPATH);
 
@@ -3506,6 +3553,21 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 new_objects:
 
 	pc.flags = gfpflags;
+	/*
+	 * When a preferred node is indicated but no __GFP_THISNODE
+	 *
+	 * 1) try to get a partial slab from target node only by having
+	 *    __GFP_THISNODE in pc.flags for get_partial()
+	 * 2) if 1) failed, try to allocate a new slab from target node with
+	 *    GPF_NOWAIT | __GFP_THISNODE opportunistically
+	 * 3) if 2) failed, retry with original gfpflags which will allow
+	 *    get_partial() try partial lists of other nodes before potentially
+	 *    allocating new page from other nodes
+	 */
+	if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
+		     && try_thisnode))
+		pc.flags = GFP_NOWAIT | __GFP_THISNODE;
+
 	pc.orig_size = orig_size;
 	slab = get_partial(s, node, &pc);
 	if (slab) {
@@ -3527,10 +3589,15 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	}
 
 	slub_put_cpu_ptr(s->cpu_slab);
-	slab = new_slab(s, gfpflags, node);
+	slab = new_slab(s, pc.flags, node);
 	c = slub_get_cpu_ptr(s->cpu_slab);
 
 	if (unlikely(!slab)) {
+		if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
+		    && try_thisnode) {
+			try_thisnode = false;
+			goto new_objects;
+		}
 		slab_out_of_memory(s, gfpflags, node);
 		return NULL;
 	}
@@ -4232,7 +4299,7 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
 	c = raw_cpu_ptr(s->cpu_slab);
 	tid = READ_ONCE(c->tid);
 
-	/* Same with comment on barrier() in slab_alloc_node() */
+	/* Same with comment on barrier() in __slab_alloc_node() */
 	barrier();
 
 	if (unlikely(slab != c->slab)) {
@@ -4853,7 +4920,6 @@ static void early_kmem_cache_node_alloc(int node)
 	BUG_ON(!n);
 #ifdef CONFIG_SLUB_DEBUG
 	init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
-	init_tracking(kmem_cache_node, n);
 #endif
 	n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
 	slab->freelist = get_freepointer(kmem_cache_node, n);
@@ -5066,9 +5132,7 @@ static int calculate_sizes(struct kmem_cache *s)
 	if ((int)order < 0)
 		return 0;
 
-	s->allocflags = 0;
-	if (order)
-		s->allocflags |= __GFP_COMP;
+	s->allocflags = __GFP_COMP;
 
 	if (s->flags & SLAB_CACHE_DMA)
 		s->allocflags |= GFP_DMA;
@@ -6042,7 +6106,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
 			else if (flags & SO_OBJECTS)
 				WARN_ON_ONCE(1);
 			else
-				x = slab->slabs;
+				x = data_race(slab->slabs);
 			total += x;
 			nodes[node] += x;
 		}
@@ -6247,7 +6311,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
 		slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
 
 		if (slab)
-			slabs += slab->slabs;
+			slabs += data_race(slab->slabs);
 	}
 #endif
 
@@ -6261,7 +6325,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
 
 		slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
 		if (slab) {
-			slabs = READ_ONCE(slab->slabs);
+			slabs = data_race(slab->slabs);
 			objects = (slabs * oo_objects(s->oo)) / 2;
 			len += sysfs_emit_at(buf, len, " C%d=%d(%d)",
 					     cpu, objects, slabs);
@@ -7095,7 +7159,7 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
 	for_each_kmem_cache_node(s, node, n) {
 		nr_slabs += node_nr_slabs(n);
 		nr_objs += node_nr_objs(n);
-		nr_free += count_partial(n, count_free);
+		nr_free += count_partial_free_approx(n);
 	}
 
 	sinfo->active_objs = nr_objs - nr_free;
@@ -7105,14 +7169,4 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
 	sinfo->objects_per_slab = oo_objects(s->oo);
 	sinfo->cache_order = oo_order(s->oo);
 }
-
-void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
-{
-}
-
-ssize_t slabinfo_write(struct file *file, const char __user *buffer,
-		       size_t count, loff_t *ppos)
-{
-	return -EIO;
-}
 #endif /* CONFIG_SLUB_DEBUG */

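The try_thisnode flow added to ___slab_alloc() in the diff above implements the three-step strategy from the commit message: first the preferred node's partial list, then an opportunistic GFP_NOWAIT | __GFP_THISNODE slab allocation on that node, and only then a retry with the original gfpflags that makes other nodes' partial lists eligible before a regular new-slab allocation. A compressed C restatement of that control flow follows; the helper names and return values are placeholders, not the kernel API:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the individual steps; here the preferred
 * node is assumed to be exhausted, so steps 1 and 2 fail. */
static bool partial_on_node(int node)		{ (void)node; return false; }
static bool new_slab_nowait_on_node(int node)	{ (void)node; return false; }
static bool partial_on_any_node(void)		{ return true; }
static bool new_slab_on_any_node(void)		{ return true; }

static const char *alloc_on(int node)
{
	bool try_thisnode = true;

retry:
	if (try_thisnode) {
		if (partial_on_node(node))		/* step 1 */
			return "partial slab from the preferred node";
		if (new_slab_nowait_on_node(node))	/* step 2 */
			return "new slab on the preferred node (GFP_NOWAIT)";
		try_thisnode = false;			/* step 3 */
		goto retry;
	}
	/* retry with the original gfpflags: partial lists of all nodes
	 * are now eligible before a regular new-slab allocation */
	if (partial_on_node(node) || partial_on_any_node())
		return "reused a partial slab (preferred or fallback node)";
	if (new_slab_on_any_node())
		return "allocated a new slab with the original gfpflags";
	return "allocation failed";
}

int main(void)
{
	/* with the preferred node full, an existing partial slab on a
	 * fallback node is reused instead of allocating a new one */
	printf("%s\n", alloc_on(0));
	return 0;
}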