Skip to content

Commit 0a7713a

Browse files
committed
Merge patch series "reduce tasklist_lock hold time on exit and do some pid cleanup"
Mateusz Guzik <[email protected]> says: The clone side contends against exit side in a way which avoidably exacerbates the problem by the latter waiting on locks held by the former while holding the tasklist_lock. Whacking this for both add_device_randomness and pids allocation gives me a 15% speed up for thread creation/destruction in a 24-core vm. The random patch is worth about 4%. The new bottleneck is pidmap_lock itself, with the biggest problem being the allocation itself taking the lock *twice*. Bench (plop into will-it-scale): $ cat tests/threadspawn1.c char *testcase_description = "Thread creation and teardown"; static void *worker(void *arg) { return (NULL); } void testcase(unsigned long long *iterations, unsigned long nr) { pthread_t thread; int error; while (1) { error = pthread_create(&thread, NULL, worker, NULL); assert(error == 0); error = pthread_join(thread, NULL); assert(error == 0); (*iterations)++; } } * patches from https://lore.kernel.org/r/[email protected]: pid: drop irq disablement around pidmap_lock pid: perform free_pid() calls outside of tasklist_lock pid: sprinkle tasklist_lock asserts exit: hoist get_pid() in release_task() outside of tasklist_lock exit: perform add_device_randomness() without tasklist_lock Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Christian Brauner <[email protected]>
2 parents 33be3ff + 627454c commit 0a7713a

File tree

4 files changed

+82
-57
lines changed

4 files changed

+82
-57
lines changed

include/linux/pid.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,9 +101,9 @@ extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
101101
* these helpers must be called with the tasklist_lock write-held.
102102
*/
103103
extern void attach_pid(struct task_struct *task, enum pid_type);
104-
extern void detach_pid(struct task_struct *task, enum pid_type);
105-
extern void change_pid(struct task_struct *task, enum pid_type,
106-
struct pid *pid);
104+
void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type);
105+
void change_pid(struct pid **pids, struct task_struct *task, enum pid_type,
106+
struct pid *pid);
107107
extern void exchange_tids(struct task_struct *task, struct task_struct *old);
108108
extern void transfer_pid(struct task_struct *old, struct task_struct *new,
109109
enum pid_type);
@@ -129,6 +129,7 @@ extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
129129
extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
130130
size_t set_tid_size);
131131
extern void free_pid(struct pid *pid);
132+
void free_pids(struct pid **pids);
132133
extern void disable_pid_allocation(struct pid_namespace *ns);
133134

134135
/*

kernel/exit.c

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -122,14 +122,22 @@ static __init int kernel_exit_sysfs_init(void)
122122
late_initcall(kernel_exit_sysfs_init);
123123
#endif
124124

125-
static void __unhash_process(struct task_struct *p, bool group_dead)
125+
/*
126+
* For things release_task() would like to do *after* tasklist_lock is released.
127+
*/
128+
struct release_task_post {
129+
struct pid *pids[PIDTYPE_MAX];
130+
};
131+
132+
static void __unhash_process(struct release_task_post *post, struct task_struct *p,
133+
bool group_dead)
126134
{
127135
nr_threads--;
128-
detach_pid(p, PIDTYPE_PID);
136+
detach_pid(post->pids, p, PIDTYPE_PID);
129137
if (group_dead) {
130-
detach_pid(p, PIDTYPE_TGID);
131-
detach_pid(p, PIDTYPE_PGID);
132-
detach_pid(p, PIDTYPE_SID);
138+
detach_pid(post->pids, p, PIDTYPE_TGID);
139+
detach_pid(post->pids, p, PIDTYPE_PGID);
140+
detach_pid(post->pids, p, PIDTYPE_SID);
133141

134142
list_del_rcu(&p->tasks);
135143
list_del_init(&p->sibling);
@@ -141,7 +149,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
141149
/*
142150
* This function expects the tasklist_lock write-locked.
143151
*/
144-
static void __exit_signal(struct task_struct *tsk)
152+
static void __exit_signal(struct release_task_post *post, struct task_struct *tsk)
145153
{
146154
struct signal_struct *sig = tsk->signal;
147155
bool group_dead = thread_group_leader(tsk);
@@ -174,9 +182,6 @@ static void __exit_signal(struct task_struct *tsk)
174182
sig->curr_target = next_thread(tsk);
175183
}
176184

177-
add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
178-
sizeof(unsigned long long));
179-
180185
/*
181186
* Accumulate here the counters for all threads as they die. We could
182187
* skip the group leader because it is the last user of signal_struct,
@@ -197,7 +202,7 @@ static void __exit_signal(struct task_struct *tsk)
197202
task_io_accounting_add(&sig->ioac, &tsk->ioac);
198203
sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
199204
sig->nr_threads--;
200-
__unhash_process(tsk, group_dead);
205+
__unhash_process(post, tsk, group_dead);
201206
write_sequnlock(&sig->stats_lock);
202207

203208
tsk->sighand = NULL;
@@ -231,10 +236,13 @@ void __weak release_thread(struct task_struct *dead_task)
231236

232237
void release_task(struct task_struct *p)
233238
{
239+
struct release_task_post post;
234240
struct task_struct *leader;
235241
struct pid *thread_pid;
236242
int zap_leader;
237243
repeat:
244+
memset(&post, 0, sizeof(post));
245+
238246
/* don't need to get the RCU readlock here - the process is dead and
239247
* can't be modifying its own credentials. But shut RCU-lockdep up */
240248
rcu_read_lock();
@@ -243,10 +251,11 @@ void release_task(struct task_struct *p)
243251

244252
cgroup_release(p);
245253

254+
thread_pid = get_pid(p->thread_pid);
255+
246256
write_lock_irq(&tasklist_lock);
247257
ptrace_release_task(p);
248-
thread_pid = get_pid(p->thread_pid);
249-
__exit_signal(p);
258+
__exit_signal(&post, p);
250259

251260
/*
252261
* If we are the last non-leader member of the thread
@@ -270,6 +279,9 @@ void release_task(struct task_struct *p)
270279
write_unlock_irq(&tasklist_lock);
271280
proc_flush_pid(thread_pid);
272281
put_pid(thread_pid);
282+
add_device_randomness(&p->se.sum_exec_runtime,
283+
sizeof(p->se.sum_exec_runtime));
284+
free_pids(post.pids);
273285
release_thread(p);
274286
/*
275287
* This task was already removed from the process/thread/pid lists

kernel/pid.c

Lines changed: 45 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -88,20 +88,6 @@ struct pid_namespace init_pid_ns = {
8888
};
8989
EXPORT_SYMBOL_GPL(init_pid_ns);
9090

91-
/*
92-
* Note: disable interrupts while the pidmap_lock is held as an
93-
* interrupt might come in and do read_lock(&tasklist_lock).
94-
*
95-
* If we don't disable interrupts there is a nasty deadlock between
96-
* detach_pid()->free_pid() and another cpu that does
97-
* spin_lock(&pidmap_lock) followed by an interrupt routine that does
98-
* read_lock(&tasklist_lock);
99-
*
100-
* After we clean up the tasklist_lock and know there are no
101-
* irq handlers that take it we can leave the interrupts enabled.
102-
* For now it is easier to be safe than to prove it can't happen.
103-
*/
104-
10591
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
10692
seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
10793

@@ -128,11 +114,11 @@ static void delayed_put_pid(struct rcu_head *rhp)
128114

129115
void free_pid(struct pid *pid)
130116
{
131-
/* We can be called with write_lock_irq(&tasklist_lock) held */
132117
int i;
133-
unsigned long flags;
134118

135-
spin_lock_irqsave(&pidmap_lock, flags);
119+
lockdep_assert_not_held(&tasklist_lock);
120+
121+
spin_lock(&pidmap_lock);
136122
for (i = 0; i <= pid->level; i++) {
137123
struct upid *upid = pid->numbers + i;
138124
struct pid_namespace *ns = upid->ns;
@@ -155,11 +141,23 @@ void free_pid(struct pid *pid)
155141
idr_remove(&ns->idr, upid->nr);
156142
}
157143
pidfs_remove_pid(pid);
158-
spin_unlock_irqrestore(&pidmap_lock, flags);
144+
spin_unlock(&pidmap_lock);
159145

160146
call_rcu(&pid->rcu, delayed_put_pid);
161147
}
162148

149+
void free_pids(struct pid **pids)
150+
{
151+
int tmp;
152+
153+
/*
154+
* This can batch pidmap_lock.
155+
*/
156+
for (tmp = PIDTYPE_MAX; --tmp >= 0; )
157+
if (pids[tmp])
158+
free_pid(pids[tmp]);
159+
}
160+
163161
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
164162
size_t set_tid_size)
165163
{
@@ -211,7 +209,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
211209
}
212210

213211
idr_preload(GFP_KERNEL);
214-
spin_lock_irq(&pidmap_lock);
212+
spin_lock(&pidmap_lock);
215213

216214
if (tid) {
217215
nr = idr_alloc(&tmp->idr, NULL, tid,
@@ -238,7 +236,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
238236
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
239237
pid_max, GFP_ATOMIC);
240238
}
241-
spin_unlock_irq(&pidmap_lock);
239+
spin_unlock(&pidmap_lock);
242240
idr_preload_end();
243241

244242
if (nr < 0) {
@@ -272,7 +270,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
272270

273271
upid = pid->numbers + ns->level;
274272
idr_preload(GFP_KERNEL);
275-
spin_lock_irq(&pidmap_lock);
273+
spin_lock(&pidmap_lock);
276274
if (!(ns->pid_allocated & PIDNS_ADDING))
277275
goto out_unlock;
278276
pidfs_add_pid(pid);
@@ -281,18 +279,18 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
281279
idr_replace(&upid->ns->idr, pid, upid->nr);
282280
upid->ns->pid_allocated++;
283281
}
284-
spin_unlock_irq(&pidmap_lock);
282+
spin_unlock(&pidmap_lock);
285283
idr_preload_end();
286284

287285
return pid;
288286

289287
out_unlock:
290-
spin_unlock_irq(&pidmap_lock);
288+
spin_unlock(&pidmap_lock);
291289
idr_preload_end();
292290
put_pid_ns(ns);
293291

294292
out_free:
295-
spin_lock_irq(&pidmap_lock);
293+
spin_lock(&pidmap_lock);
296294
while (++i <= ns->level) {
297295
upid = pid->numbers + i;
298296
idr_remove(&upid->ns->idr, upid->nr);
@@ -302,17 +300,17 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
302300
if (ns->pid_allocated == PIDNS_ADDING)
303301
idr_set_cursor(&ns->idr, 0);
304302

305-
spin_unlock_irq(&pidmap_lock);
303+
spin_unlock(&pidmap_lock);
306304

307305
kmem_cache_free(ns->pid_cachep, pid);
308306
return ERR_PTR(retval);
309307
}
310308

311309
void disable_pid_allocation(struct pid_namespace *ns)
312310
{
313-
spin_lock_irq(&pidmap_lock);
311+
spin_lock(&pidmap_lock);
314312
ns->pid_allocated &= ~PIDNS_ADDING;
315-
spin_unlock_irq(&pidmap_lock);
313+
spin_unlock(&pidmap_lock);
316314
}
317315

318316
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
@@ -339,17 +337,23 @@ static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
339337
*/
340338
void attach_pid(struct task_struct *task, enum pid_type type)
341339
{
342-
struct pid *pid = *task_pid_ptr(task, type);
340+
struct pid *pid;
341+
342+
lockdep_assert_held_write(&tasklist_lock);
343+
344+
pid = *task_pid_ptr(task, type);
343345
hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
344346
}
345347

346-
static void __change_pid(struct task_struct *task, enum pid_type type,
347-
struct pid *new)
348+
static void __change_pid(struct pid **pids, struct task_struct *task,
349+
enum pid_type type, struct pid *new)
348350
{
349-
struct pid **pid_ptr = task_pid_ptr(task, type);
350-
struct pid *pid;
351+
struct pid **pid_ptr, *pid;
351352
int tmp;
352353

354+
lockdep_assert_held_write(&tasklist_lock);
355+
356+
pid_ptr = task_pid_ptr(task, type);
353357
pid = *pid_ptr;
354358

355359
hlist_del_rcu(&task->pid_links[type]);
@@ -364,18 +368,19 @@ static void __change_pid(struct task_struct *task, enum pid_type type,
364368
if (pid_has_task(pid, tmp))
365369
return;
366370

367-
free_pid(pid);
371+
WARN_ON(pids[type]);
372+
pids[type] = pid;
368373
}
369374

370-
void detach_pid(struct task_struct *task, enum pid_type type)
375+
void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type type)
371376
{
372-
__change_pid(task, type, NULL);
377+
__change_pid(pids, task, type, NULL);
373378
}
374379

375-
void change_pid(struct task_struct *task, enum pid_type type,
380+
void change_pid(struct pid **pids, struct task_struct *task, enum pid_type type,
376381
struct pid *pid)
377382
{
378-
__change_pid(task, type, pid);
383+
__change_pid(pids, task, type, pid);
379384
attach_pid(task, type);
380385
}
381386

@@ -386,6 +391,8 @@ void exchange_tids(struct task_struct *left, struct task_struct *right)
386391
struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID];
387392
struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID];
388393

394+
lockdep_assert_held_write(&tasklist_lock);
395+
389396
/* Swap the single entry tid lists */
390397
hlists_swap_heads_rcu(head1, head2);
391398

@@ -403,6 +410,7 @@ void transfer_pid(struct task_struct *old, struct task_struct *new,
403410
enum pid_type type)
404411
{
405412
WARN_ON_ONCE(type == PIDTYPE_PID);
413+
lockdep_assert_held_write(&tasklist_lock);
406414
hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
407415
}
408416

kernel/sys.c

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1085,6 +1085,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
10851085
{
10861086
struct task_struct *p;
10871087
struct task_struct *group_leader = current->group_leader;
1088+
struct pid *pids[PIDTYPE_MAX] = { 0 };
10881089
struct pid *pgrp;
10891090
int err;
10901091

@@ -1142,13 +1143,14 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
11421143
goto out;
11431144

11441145
if (task_pgrp(p) != pgrp)
1145-
change_pid(p, PIDTYPE_PGID, pgrp);
1146+
change_pid(pids, p, PIDTYPE_PGID, pgrp);
11461147

11471148
err = 0;
11481149
out:
11491150
/* All paths lead to here, thus we are safe. -DaveM */
11501151
write_unlock_irq(&tasklist_lock);
11511152
rcu_read_unlock();
1153+
free_pids(pids);
11521154
return err;
11531155
}
11541156

@@ -1222,21 +1224,22 @@ SYSCALL_DEFINE1(getsid, pid_t, pid)
12221224
return retval;
12231225
}
12241226

1225-
static void set_special_pids(struct pid *pid)
1227+
static void set_special_pids(struct pid **pids, struct pid *pid)
12261228
{
12271229
struct task_struct *curr = current->group_leader;
12281230

12291231
if (task_session(curr) != pid)
1230-
change_pid(curr, PIDTYPE_SID, pid);
1232+
change_pid(pids, curr, PIDTYPE_SID, pid);
12311233

12321234
if (task_pgrp(curr) != pid)
1233-
change_pid(curr, PIDTYPE_PGID, pid);
1235+
change_pid(pids, curr, PIDTYPE_PGID, pid);
12341236
}
12351237

12361238
int ksys_setsid(void)
12371239
{
12381240
struct task_struct *group_leader = current->group_leader;
12391241
struct pid *sid = task_pid(group_leader);
1242+
struct pid *pids[PIDTYPE_MAX] = { 0 };
12401243
pid_t session = pid_vnr(sid);
12411244
int err = -EPERM;
12421245

@@ -1252,13 +1255,14 @@ int ksys_setsid(void)
12521255
goto out;
12531256

12541257
group_leader->signal->leader = 1;
1255-
set_special_pids(sid);
1258+
set_special_pids(pids, sid);
12561259

12571260
proc_clear_tty(group_leader);
12581261

12591262
err = session;
12601263
out:
12611264
write_unlock_irq(&tasklist_lock);
1265+
free_pids(pids);
12621266
if (err > 0) {
12631267
proc_sid_connector(group_leader);
12641268
sched_autogroup_create_attach(group_leader);

0 commit comments

Comments
 (0)