Skip to content

Commit e987af4

Browse files
committed
Merge tag 'percpu-for-6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu
Pull percpu updates from Dennis Zhou: "One bigger change to percpu_counter's api allowing for init and destroy of multiple counters via percpu_counter_init_many() and percpu_counter_destroy_many(). This is used to help begin remediating a performance regression with percpu rss stats. Additionally, it seems larger core count machines are feeling the burden of the single threaded allocation of percpu. Mateusz is thinking about it and I will spend some time on it too. percpu: - A couple cleanups by Baoquan He and Bibo Mao. The only behavior change is to start printing messages if we're under the warn limit for failed atomic allocations. percpu_counter: - Shakeel introduced percpu counters into mm_struct which caused percpu allocations to be on the hot path [1]. Originally I spent some time trying to improve the percpu allocator, but instead preferred what Mateusz Guzik proposed: grouping at the allocation site, percpu_counter_init_many(). This allows a single percpu allocation to be shared by the counters. I like this approach because it creates a shared lifetime by the allocations. Additionally, I believe many inits have higher level synchronization requirements, like percpu_counter does against HOTPLUG_CPU. Therefore we can group these optimizations together" Link: https://lore.kernel.org/linux-mm/[email protected]/ [1] * tag 'percpu-for-6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu: kernel/fork: group allocation/free of per-cpu counters for mm struct pcpcntr: add group allocation/free mm/percpu.c: print error message too if atomic alloc failed mm/percpu.c: optimize the code in pcpu_setup_first_chunk() a little bit mm/percpu.c: remove redundant check mm/percpu: Remove some local variables in pcpu_populate_pte
2 parents 0fe2b86 + 14ef95b commit e987af4

File tree

4 files changed

+109
-78
lines changed

4 files changed

+109
-78
lines changed

include/linux/percpu_counter.h

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,17 +30,28 @@ struct percpu_counter {
3030

3131
extern int percpu_counter_batch;
3232

33-
int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
34-
struct lock_class_key *key);
33+
int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
34+
gfp_t gfp, u32 nr_counters,
35+
struct lock_class_key *key);
3536

36-
#define percpu_counter_init(fbc, value, gfp) \
37+
#define percpu_counter_init_many(fbc, value, gfp, nr_counters) \
3738
({ \
3839
static struct lock_class_key __key; \
3940
\
40-
__percpu_counter_init(fbc, value, gfp, &__key); \
41+
__percpu_counter_init_many(fbc, value, gfp, nr_counters,\
42+
&__key); \
4143
})
4244

43-
void percpu_counter_destroy(struct percpu_counter *fbc);
45+
46+
#define percpu_counter_init(fbc, value, gfp) \
47+
percpu_counter_init_many(fbc, value, gfp, 1)
48+
49+
void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters);
50+
static inline void percpu_counter_destroy(struct percpu_counter *fbc)
51+
{
52+
percpu_counter_destroy_many(fbc, 1);
53+
}
54+
4455
void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
4556
void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
4657
s32 batch);
@@ -116,11 +127,27 @@ struct percpu_counter {
116127
s64 count;
117128
};
118129

130+
static inline int percpu_counter_init_many(struct percpu_counter *fbc,
131+
s64 amount, gfp_t gfp,
132+
u32 nr_counters)
133+
{
134+
u32 i;
135+
136+
for (i = 0; i < nr_counters; i++)
137+
fbc[i].count = amount;
138+
139+
return 0;
140+
}
141+
119142
static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount,
120143
gfp_t gfp)
121144
{
122-
fbc->count = amount;
123-
return 0;
145+
return percpu_counter_init_many(fbc, amount, gfp, 1);
146+
}
147+
148+
static inline void percpu_counter_destroy_many(struct percpu_counter *fbc,
149+
u32 nr_counters)
150+
{
124151
}
125152

126153
static inline void percpu_counter_destroy(struct percpu_counter *fbc)

kernel/fork.c

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -909,8 +909,6 @@ static void cleanup_lazy_tlbs(struct mm_struct *mm)
909909
*/
910910
void __mmdrop(struct mm_struct *mm)
911911
{
912-
int i;
913-
914912
BUG_ON(mm == &init_mm);
915913
WARN_ON_ONCE(mm == current->mm);
916914

@@ -925,9 +923,8 @@ void __mmdrop(struct mm_struct *mm)
925923
put_user_ns(mm->user_ns);
926924
mm_pasid_drop(mm);
927925
mm_destroy_cid(mm);
926+
percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
928927

929-
for (i = 0; i < NR_MM_COUNTERS; i++)
930-
percpu_counter_destroy(&mm->rss_stat[i]);
931928
free_mm(mm);
932929
}
933930
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -1260,8 +1257,6 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
12601257
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
12611258
struct user_namespace *user_ns)
12621259
{
1263-
int i;
1264-
12651260
mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
12661261
mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
12671262
atomic_set(&mm->mm_users, 1);
@@ -1309,17 +1304,15 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
13091304
if (mm_alloc_cid(mm))
13101305
goto fail_cid;
13111306

1312-
for (i = 0; i < NR_MM_COUNTERS; i++)
1313-
if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
1314-
goto fail_pcpu;
1307+
if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
1308+
NR_MM_COUNTERS))
1309+
goto fail_pcpu;
13151310

13161311
mm->user_ns = get_user_ns(user_ns);
13171312
lru_gen_init_mm(mm);
13181313
return mm;
13191314

13201315
fail_pcpu:
1321-
while (i > 0)
1322-
percpu_counter_destroy(&mm->rss_stat[--i]);
13231316
mm_destroy_cid(mm);
13241317
fail_cid:
13251318
destroy_context(mm);

lib/percpu_counter.c

Lines changed: 43 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -151,48 +151,72 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
151151
}
152152
EXPORT_SYMBOL(__percpu_counter_sum);
153153

154-
int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
155-
struct lock_class_key *key)
154+
int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
155+
gfp_t gfp, u32 nr_counters,
156+
struct lock_class_key *key)
156157
{
157158
unsigned long flags __maybe_unused;
158-
159-
raw_spin_lock_init(&fbc->lock);
160-
lockdep_set_class(&fbc->lock, key);
161-
fbc->count = amount;
162-
fbc->counters = alloc_percpu_gfp(s32, gfp);
163-
if (!fbc->counters)
159+
size_t counter_size;
160+
s32 __percpu *counters;
161+
u32 i;
162+
163+
counter_size = ALIGN(sizeof(*counters), __alignof__(*counters));
164+
counters = __alloc_percpu_gfp(nr_counters * counter_size,
165+
__alignof__(*counters), gfp);
166+
if (!counters) {
167+
fbc[0].counters = NULL;
164168
return -ENOMEM;
169+
}
165170

166-
debug_percpu_counter_activate(fbc);
171+
for (i = 0; i < nr_counters; i++) {
172+
raw_spin_lock_init(&fbc[i].lock);
173+
lockdep_set_class(&fbc[i].lock, key);
174+
#ifdef CONFIG_HOTPLUG_CPU
175+
INIT_LIST_HEAD(&fbc[i].list);
176+
#endif
177+
fbc[i].count = amount;
178+
fbc[i].counters = (void *)counters + (i * counter_size);
179+
180+
debug_percpu_counter_activate(&fbc[i]);
181+
}
167182

168183
#ifdef CONFIG_HOTPLUG_CPU
169-
INIT_LIST_HEAD(&fbc->list);
170184
spin_lock_irqsave(&percpu_counters_lock, flags);
171-
list_add(&fbc->list, &percpu_counters);
185+
for (i = 0; i < nr_counters; i++)
186+
list_add(&fbc[i].list, &percpu_counters);
172187
spin_unlock_irqrestore(&percpu_counters_lock, flags);
173188
#endif
174189
return 0;
175190
}
176-
EXPORT_SYMBOL(__percpu_counter_init);
191+
EXPORT_SYMBOL(__percpu_counter_init_many);
177192

178-
void percpu_counter_destroy(struct percpu_counter *fbc)
193+
void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters)
179194
{
180195
unsigned long flags __maybe_unused;
196+
u32 i;
197+
198+
if (WARN_ON_ONCE(!fbc))
199+
return;
181200

182-
if (!fbc->counters)
201+
if (!fbc[0].counters)
183202
return;
184203

185-
debug_percpu_counter_deactivate(fbc);
204+
for (i = 0; i < nr_counters; i++)
205+
debug_percpu_counter_deactivate(&fbc[i]);
186206

187207
#ifdef CONFIG_HOTPLUG_CPU
188208
spin_lock_irqsave(&percpu_counters_lock, flags);
189-
list_del(&fbc->list);
209+
for (i = 0; i < nr_counters; i++)
210+
list_del(&fbc[i].list);
190211
spin_unlock_irqrestore(&percpu_counters_lock, flags);
191212
#endif
192-
free_percpu(fbc->counters);
193-
fbc->counters = NULL;
213+
214+
free_percpu(fbc[0].counters);
215+
216+
for (i = 0; i < nr_counters; i++)
217+
fbc[i].counters = NULL;
194218
}
195-
EXPORT_SYMBOL(percpu_counter_destroy);
219+
EXPORT_SYMBOL(percpu_counter_destroy_many);
196220

197221
int percpu_counter_batch __read_mostly = 32;
198222
EXPORT_SYMBOL(percpu_counter_batch);

mm/percpu.c

Lines changed: 28 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1890,13 +1890,15 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
18901890
fail:
18911891
trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
18921892

1893-
if (!is_atomic && do_warn && warn_limit) {
1893+
if (do_warn && warn_limit) {
18941894
pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
18951895
size, align, is_atomic, err);
1896-
dump_stack();
1896+
if (!is_atomic)
1897+
dump_stack();
18971898
if (!--warn_limit)
18981899
pr_info("limit reached, disable warning\n");
18991900
}
1901+
19001902
if (is_atomic) {
19011903
/* see the flag handling in pcpu_balance_workfn() */
19021904
pcpu_atomic_alloc_failed = true;
@@ -2581,14 +2583,12 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
25812583
{
25822584
size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
25832585
size_t static_size, dyn_size;
2584-
struct pcpu_chunk *chunk;
25852586
unsigned long *group_offsets;
25862587
size_t *group_sizes;
25872588
unsigned long *unit_off;
25882589
unsigned int cpu;
25892590
int *unit_map;
25902591
int group, unit, i;
2591-
int map_size;
25922592
unsigned long tmp_addr;
25932593
size_t alloc_size;
25942594

@@ -2615,7 +2615,6 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
26152615
PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
26162616
PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
26172617
PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
2618-
PCPU_SETUP_BUG_ON(!ai->dyn_size);
26192618
PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
26202619
PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
26212620
IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
@@ -2698,7 +2697,7 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
26982697
pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
26992698
pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
27002699
pcpu_atom_size = ai->atom_size;
2701-
pcpu_chunk_struct_size = struct_size(chunk, populated,
2700+
pcpu_chunk_struct_size = struct_size((struct pcpu_chunk *)0, populated,
27022701
BITS_TO_LONGS(pcpu_unit_pages));
27032702

27042703
pcpu_stats_save_ai(ai);
@@ -2735,29 +2734,23 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
27352734
dyn_size = ai->dyn_size - (static_size - ai->static_size);
27362735

27372736
/*
2738-
* Initialize first chunk.
2739-
* If the reserved_size is non-zero, this initializes the reserved
2740-
* chunk. If the reserved_size is zero, the reserved chunk is NULL
2741-
* and the dynamic region is initialized here. The first chunk,
2742-
* pcpu_first_chunk, will always point to the chunk that serves
2743-
* the dynamic region.
2737+
* Initialize first chunk:
2738+
* This chunk is broken up into 3 parts:
2739+
* < static | [reserved] | dynamic >
2740+
* - static - there is no backing chunk because these allocations can
2741+
* never be freed.
2742+
* - reserved (pcpu_reserved_chunk) - exists primarily to serve
2743+
* allocations from module load.
2744+
* - dynamic (pcpu_first_chunk) - serves the dynamic part of the first
2745+
* chunk.
27442746
*/
27452747
tmp_addr = (unsigned long)base_addr + static_size;
2746-
map_size = ai->reserved_size ?: dyn_size;
2747-
chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
2748-
2749-
/* init dynamic chunk if necessary */
2750-
if (ai->reserved_size) {
2751-
pcpu_reserved_chunk = chunk;
2748+
if (ai->reserved_size)
2749+
pcpu_reserved_chunk = pcpu_alloc_first_chunk(tmp_addr,
2750+
ai->reserved_size);
2751+
tmp_addr = (unsigned long)base_addr + static_size + ai->reserved_size;
2752+
pcpu_first_chunk = pcpu_alloc_first_chunk(tmp_addr, dyn_size);
27522753

2753-
tmp_addr = (unsigned long)base_addr + static_size +
2754-
ai->reserved_size;
2755-
map_size = dyn_size;
2756-
chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
2757-
}
2758-
2759-
/* link the first chunk in */
2760-
pcpu_first_chunk = chunk;
27612754
pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
27622755
pcpu_chunk_relocate(pcpu_first_chunk, -1);
27632756

@@ -3189,32 +3182,26 @@ void __init __weak pcpu_populate_pte(unsigned long addr)
31893182
pmd_t *pmd;
31903183

31913184
if (pgd_none(*pgd)) {
3192-
p4d_t *new;
3193-
3194-
new = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
3195-
if (!new)
3185+
p4d = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
3186+
if (!p4d)
31963187
goto err_alloc;
3197-
pgd_populate(&init_mm, pgd, new);
3188+
pgd_populate(&init_mm, pgd, p4d);
31983189
}
31993190

32003191
p4d = p4d_offset(pgd, addr);
32013192
if (p4d_none(*p4d)) {
3202-
pud_t *new;
3203-
3204-
new = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
3205-
if (!new)
3193+
pud = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
3194+
if (!pud)
32063195
goto err_alloc;
3207-
p4d_populate(&init_mm, p4d, new);
3196+
p4d_populate(&init_mm, p4d, pud);
32083197
}
32093198

32103199
pud = pud_offset(p4d, addr);
32113200
if (pud_none(*pud)) {
3212-
pmd_t *new;
3213-
3214-
new = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
3215-
if (!new)
3201+
pmd = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
3202+
if (!pmd)
32163203
goto err_alloc;
3217-
pud_populate(&init_mm, pud, new);
3204+
pud_populate(&init_mm, pud, pmd);
32183205
}
32193206

32203207
pmd = pmd_offset(pud, addr);

0 commit comments

Comments
 (0)