Skip to content

Commit 7903f90

Browse files
mjguzikbrauner
authored andcommitted
pid: perform free_pid() calls outside of tasklist_lock
As the clone side already executes pid allocation with only pidmap_lock held, issuing free_pid() while still holding tasklist_lock exacerbates total hold time of the latter. More things may show up later which require initial clean up with the lock held and allow finishing without it. For that reason a struct to collect such work is added instead of merely passing the pid array. Reviewed-by: Oleg Nesterov <[email protected]> Signed-off-by: Mateusz Guzik <[email protected]> Link: https://lore.kernel.org/r/[email protected] Acked-by: "Liam R. Howlett" <[email protected]> Signed-off-by: Christian Brauner <[email protected]>
1 parent 74198dc commit 7903f90

File tree

4 files changed

+55
-38
lines changed

4 files changed

+55
-38
lines changed

include/linux/pid.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,9 +101,9 @@ extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
101101
* these helpers must be called with the tasklist_lock write-held.
102102
*/
103103
extern void attach_pid(struct task_struct *task, enum pid_type);
104-
extern void detach_pid(struct task_struct *task, enum pid_type);
105-
extern void change_pid(struct task_struct *task, enum pid_type,
106-
struct pid *pid);
104+
void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type);
105+
void change_pid(struct pid **pids, struct task_struct *task, enum pid_type,
106+
struct pid *pid);
107107
extern void exchange_tids(struct task_struct *task, struct task_struct *old);
108108
extern void transfer_pid(struct task_struct *old, struct task_struct *new,
109109
enum pid_type);
@@ -129,6 +129,7 @@ extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
129129
extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
130130
size_t set_tid_size);
131131
extern void free_pid(struct pid *pid);
132+
void free_pids(struct pid **pids);
132133
extern void disable_pid_allocation(struct pid_namespace *ns);
133134

134135
/*

kernel/exit.c

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -122,14 +122,22 @@ static __init int kernel_exit_sysfs_init(void)
122122
late_initcall(kernel_exit_sysfs_init);
123123
#endif
124124

125-
static void __unhash_process(struct task_struct *p, bool group_dead)
125+
/*
126+
* For things release_task() would like to do *after* tasklist_lock is released.
127+
*/
128+
struct release_task_post {
129+
struct pid *pids[PIDTYPE_MAX];
130+
};
131+
132+
static void __unhash_process(struct release_task_post *post, struct task_struct *p,
133+
bool group_dead)
126134
{
127135
nr_threads--;
128-
detach_pid(p, PIDTYPE_PID);
136+
detach_pid(post->pids, p, PIDTYPE_PID);
129137
if (group_dead) {
130-
detach_pid(p, PIDTYPE_TGID);
131-
detach_pid(p, PIDTYPE_PGID);
132-
detach_pid(p, PIDTYPE_SID);
138+
detach_pid(post->pids, p, PIDTYPE_TGID);
139+
detach_pid(post->pids, p, PIDTYPE_PGID);
140+
detach_pid(post->pids, p, PIDTYPE_SID);
133141

134142
list_del_rcu(&p->tasks);
135143
list_del_init(&p->sibling);
@@ -141,7 +149,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
141149
/*
142150
* This function expects the tasklist_lock write-locked.
143151
*/
144-
static void __exit_signal(struct task_struct *tsk)
152+
static void __exit_signal(struct release_task_post *post, struct task_struct *tsk)
145153
{
146154
struct signal_struct *sig = tsk->signal;
147155
bool group_dead = thread_group_leader(tsk);
@@ -194,7 +202,7 @@ static void __exit_signal(struct task_struct *tsk)
194202
task_io_accounting_add(&sig->ioac, &tsk->ioac);
195203
sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
196204
sig->nr_threads--;
197-
__unhash_process(tsk, group_dead);
205+
__unhash_process(post, tsk, group_dead);
198206
write_sequnlock(&sig->stats_lock);
199207

200208
tsk->sighand = NULL;
@@ -228,10 +236,13 @@ void __weak release_thread(struct task_struct *dead_task)
228236

229237
void release_task(struct task_struct *p)
230238
{
239+
struct release_task_post post;
231240
struct task_struct *leader;
232241
struct pid *thread_pid;
233242
int zap_leader;
234243
repeat:
244+
memset(&post, 0, sizeof(post));
245+
235246
/* don't need to get the RCU readlock here - the process is dead and
236247
* can't be modifying its own credentials. But shut RCU-lockdep up */
237248
rcu_read_lock();
@@ -244,7 +255,7 @@ void release_task(struct task_struct *p)
244255

245256
write_lock_irq(&tasklist_lock);
246257
ptrace_release_task(p);
247-
__exit_signal(p);
258+
__exit_signal(&post, p);
248259

249260
/*
250261
* If we are the last non-leader member of the thread
@@ -270,6 +281,7 @@ void release_task(struct task_struct *p)
270281
put_pid(thread_pid);
271282
add_device_randomness(&p->se.sum_exec_runtime,
272283
sizeof(p->se.sum_exec_runtime));
284+
free_pids(post.pids);
273285
release_thread(p);
274286
/*
275287
* This task was already removed from the process/thread/pid lists

kernel/pid.c

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -88,20 +88,6 @@ struct pid_namespace init_pid_ns = {
8888
};
8989
EXPORT_SYMBOL_GPL(init_pid_ns);
9090

91-
/*
92-
* Note: disable interrupts while the pidmap_lock is held as an
93-
* interrupt might come in and do read_lock(&tasklist_lock).
94-
*
95-
* If we don't disable interrupts there is a nasty deadlock between
96-
* detach_pid()->free_pid() and another cpu that does
97-
* spin_lock(&pidmap_lock) followed by an interrupt routine that does
98-
* read_lock(&tasklist_lock);
99-
*
100-
* After we clean up the tasklist_lock and know there are no
101-
* irq handlers that take it we can leave the interrupts enabled.
102-
* For now it is easier to be safe than to prove it can't happen.
103-
*/
104-
10591
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
10692
seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
10793

@@ -128,10 +114,11 @@ static void delayed_put_pid(struct rcu_head *rhp)
128114

129115
void free_pid(struct pid *pid)
130116
{
131-
/* We can be called with write_lock_irq(&tasklist_lock) held */
132117
int i;
133118
unsigned long flags;
134119

120+
lockdep_assert_not_held(&tasklist_lock);
121+
135122
spin_lock_irqsave(&pidmap_lock, flags);
136123
for (i = 0; i <= pid->level; i++) {
137124
struct upid *upid = pid->numbers + i;
@@ -160,6 +147,18 @@ void free_pid(struct pid *pid)
160147
call_rcu(&pid->rcu, delayed_put_pid);
161148
}
162149

150+
void free_pids(struct pid **pids)
151+
{
152+
int tmp;
153+
154+
/*
155+
* This can batch pidmap_lock.
156+
*/
157+
for (tmp = PIDTYPE_MAX; --tmp >= 0; )
158+
if (pids[tmp])
159+
free_pid(pids[tmp]);
160+
}
161+
163162
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
164163
size_t set_tid_size)
165164
{
@@ -347,8 +346,8 @@ void attach_pid(struct task_struct *task, enum pid_type type)
347346
hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
348347
}
349348

350-
static void __change_pid(struct task_struct *task, enum pid_type type,
351-
struct pid *new)
349+
static void __change_pid(struct pid **pids, struct task_struct *task,
350+
enum pid_type type, struct pid *new)
352351
{
353352
struct pid **pid_ptr, *pid;
354353
int tmp;
@@ -370,18 +369,19 @@ static void __change_pid(struct task_struct *task, enum pid_type type,
370369
if (pid_has_task(pid, tmp))
371370
return;
372371

373-
free_pid(pid);
372+
WARN_ON(pids[type]);
373+
pids[type] = pid;
374374
}
375375

376-
void detach_pid(struct task_struct *task, enum pid_type type)
376+
void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type type)
377377
{
378-
__change_pid(task, type, NULL);
378+
__change_pid(pids, task, type, NULL);
379379
}
380380

381-
void change_pid(struct task_struct *task, enum pid_type type,
381+
void change_pid(struct pid **pids, struct task_struct *task, enum pid_type type,
382382
struct pid *pid)
383383
{
384-
__change_pid(task, type, pid);
384+
__change_pid(pids, task, type, pid);
385385
attach_pid(task, type);
386386
}
387387

kernel/sys.c

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1085,6 +1085,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
10851085
{
10861086
struct task_struct *p;
10871087
struct task_struct *group_leader = current->group_leader;
1088+
struct pid *pids[PIDTYPE_MAX] = { 0 };
10881089
struct pid *pgrp;
10891090
int err;
10901091

@@ -1142,13 +1143,14 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
11421143
goto out;
11431144

11441145
if (task_pgrp(p) != pgrp)
1145-
change_pid(p, PIDTYPE_PGID, pgrp);
1146+
change_pid(pids, p, PIDTYPE_PGID, pgrp);
11461147

11471148
err = 0;
11481149
out:
11491150
/* All paths lead to here, thus we are safe. -DaveM */
11501151
write_unlock_irq(&tasklist_lock);
11511152
rcu_read_unlock();
1153+
free_pids(pids);
11521154
return err;
11531155
}
11541156

@@ -1222,21 +1224,22 @@ SYSCALL_DEFINE1(getsid, pid_t, pid)
12221224
return retval;
12231225
}
12241226

1225-
static void set_special_pids(struct pid *pid)
1227+
static void set_special_pids(struct pid **pids, struct pid *pid)
12261228
{
12271229
struct task_struct *curr = current->group_leader;
12281230

12291231
if (task_session(curr) != pid)
1230-
change_pid(curr, PIDTYPE_SID, pid);
1232+
change_pid(pids, curr, PIDTYPE_SID, pid);
12311233

12321234
if (task_pgrp(curr) != pid)
1233-
change_pid(curr, PIDTYPE_PGID, pid);
1235+
change_pid(pids, curr, PIDTYPE_PGID, pid);
12341236
}
12351237

12361238
int ksys_setsid(void)
12371239
{
12381240
struct task_struct *group_leader = current->group_leader;
12391241
struct pid *sid = task_pid(group_leader);
1242+
struct pid *pids[PIDTYPE_MAX] = { 0 };
12401243
pid_t session = pid_vnr(sid);
12411244
int err = -EPERM;
12421245

@@ -1252,13 +1255,14 @@ int ksys_setsid(void)
12521255
goto out;
12531256

12541257
group_leader->signal->leader = 1;
1255-
set_special_pids(sid);
1258+
set_special_pids(pids, sid);
12561259

12571260
proc_clear_tty(group_leader);
12581261

12591262
err = session;
12601263
out:
12611264
write_unlock_irq(&tasklist_lock);
1265+
free_pids(pids);
12621266
if (err > 0) {
12631267
proc_sid_connector(group_leader);
12641268
sched_autogroup_create_attach(group_leader);

0 commit comments

Comments
 (0)