Skip to content

Commit d987ca1

Browse files
committed
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace
Pull exec/proc updates from Eric Biederman: "This contains two significant pieces of work: the work to sort out proc_flush_task, and the work to solve a deadlock between strace and exec. Fixing proc_flush_task so that it no longer requires a persistent mount makes improvements to proc possible. The removal of the persistent mount solves an old regression that caused the hidepid mount option to only work on remount, not on mount. The regression was found and reported by the Android folks. This further allows Alexey Gladkov's work making proc mount options specific to an individual mount of proc to move forward. The work on exec starts solving a long-standing issue with exec that it takes mutexes of blocking userspace applications, which makes exec extremely deadlock prone. For the moment this adds a second mutex with a narrower scope that handles all of the easy cases, which makes the tricky cases easy to spot. With a little luck the code to solve those deadlocks will be ready by next merge window." * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: (25 commits) signal: Extend exec_id to 64bits pidfd: Use new infrastructure to fix deadlocks in execve perf: Use new infrastructure to fix deadlocks in execve proc: io_accounting: Use new infrastructure to fix deadlocks in execve proc: Use new infrastructure to fix deadlocks in execve kernel/kcmp.c: Use new infrastructure to fix deadlocks in execve kernel: doc: remove outdated comment cred.c mm: docs: Fix a comment in process_vm_rw_core selftests/ptrace: add test cases for dead-locks exec: Fix a deadlock in strace exec: Add exec_update_mutex to replace cred_guard_mutex exec: Move exec_mmap right after de_thread in flush_old_exec exec: Move cleanup of posix timers on exec out of de_thread exec: Factor unshare_sighand out of de_thread and call it separately exec: Only compute current once in flush_old_exec pid: Improve the comment about waiting in zap_pid_ns_processes proc: Remove 
the now unnecessary internal mount of proc uml: Create a private mount of proc for mconsole uml: Don't consult current to find the proc_mnt in mconsole_proc proc: Use a list of inodes to flush from proc ...
2 parents 919dce2 + d1e7fd6 commit d987ca1

File tree

26 files changed

+349
-248
lines changed

26 files changed

+349
-248
lines changed

arch/um/drivers/mconsole_kern.c

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636
#include "mconsole_kern.h"
3737
#include <os.h>
3838

39+
static struct vfsmount *proc_mnt = NULL;
40+
3941
static int do_unlink_socket(struct notifier_block *notifier,
4042
unsigned long what, void *data)
4143
{
@@ -123,7 +125,7 @@ void mconsole_log(struct mc_request *req)
123125

124126
void mconsole_proc(struct mc_request *req)
125127
{
126-
struct vfsmount *mnt = task_active_pid_ns(current)->proc_mnt;
128+
struct vfsmount *mnt = proc_mnt;
127129
char *buf;
128130
int len;
129131
struct file *file;
@@ -134,6 +136,10 @@ void mconsole_proc(struct mc_request *req)
134136
ptr += strlen("proc");
135137
ptr = skip_spaces(ptr);
136138

139+
if (!mnt) {
140+
mconsole_reply(req, "Proc not available", 1, 0);
141+
goto out;
142+
}
137143
file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY, 0);
138144
if (IS_ERR(file)) {
139145
mconsole_reply(req, "Failed to open file", 1, 0);
@@ -683,6 +689,24 @@ void mconsole_stack(struct mc_request *req)
683689
with_console(req, stack_proc, to);
684690
}
685691

692+
/*
 * Create a kernel mount of the "proc" filesystem and cache it in the
 * file-scope proc_mnt pointer, which mconsole_proc() uses as the root
 * when opening the requested file.
 *
 * Returns 0 on success, -ENODEV if the "proc" filesystem type cannot be
 * looked up, or the error carried by the pointer kern_mount() returned.
 * On any failure proc_mnt is not assigned, so it keeps its initial NULL
 * value and mconsole_proc() answers "Proc not available".
 *
 * NOTE(review): mconsole_init() calls this without checking the return
 * value, so a failure here does not stop mconsole from initializing.
 */
static int __init mount_proc(void)
693+
{
694+
struct file_system_type *proc_fs_type;
695+
struct vfsmount *mnt;
696+
697+
proc_fs_type = get_fs_type("proc");
698+
if (!proc_fs_type)
699+
return -ENODEV;
700+
701+
mnt = kern_mount(proc_fs_type);
702+
/*
 * Release proc_fs_type before inspecting mnt so the release happens on
 * both the success and the error path (presumably this drops the
 * reference taken by get_fs_type() -- TODO confirm).
 */
put_filesystem(proc_fs_type);
703+
if (IS_ERR(mnt))
704+
return PTR_ERR(mnt);
705+
706+
/* Publish the mount only after it is known to be valid. */
proc_mnt = mnt;
707+
return 0;
708+
}
709+
686710
/*
687711
* Changed by mconsole_setup, which is __setup, and called before SMP is
688712
* active.
@@ -696,6 +720,8 @@ static int __init mconsole_init(void)
696720
int err;
697721
char file[UNIX_PATH_MAX];
698722

723+
mount_proc();
724+
699725
if (umid_file_name("mconsole", file, sizeof(file)))
700726
return -1;
701727
snprintf(mconsole_socket_name, sizeof(file), "%s", file);

fs/exec.c

Lines changed: 55 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1036,16 +1036,26 @@ ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
10361036
}
10371037
EXPORT_SYMBOL(read_code);
10381038

1039+
/*
1040+
* Maps the mm_struct mm into the current task struct.
1041+
* On success, this function returns with the mutex
1042+
* exec_update_mutex locked.
1043+
*/
10391044
static int exec_mmap(struct mm_struct *mm)
10401045
{
10411046
struct task_struct *tsk;
10421047
struct mm_struct *old_mm, *active_mm;
1048+
int ret;
10431049

10441050
/* Notify parent that we're no longer interested in the old VM */
10451051
tsk = current;
10461052
old_mm = current->mm;
10471053
exec_mm_release(tsk, old_mm);
10481054

1055+
ret = mutex_lock_killable(&tsk->signal->exec_update_mutex);
1056+
if (ret)
1057+
return ret;
1058+
10491059
if (old_mm) {
10501060
sync_mm_rss(old_mm);
10511061
/*
@@ -1057,9 +1067,11 @@ static int exec_mmap(struct mm_struct *mm)
10571067
down_read(&old_mm->mmap_sem);
10581068
if (unlikely(old_mm->core_state)) {
10591069
up_read(&old_mm->mmap_sem);
1070+
mutex_unlock(&tsk->signal->exec_update_mutex);
10601071
return -EINTR;
10611072
}
10621073
}
1074+
10631075
task_lock(tsk);
10641076
active_mm = tsk->active_mm;
10651077
membarrier_exec_mmap(mm);
@@ -1215,10 +1227,22 @@ static int de_thread(struct task_struct *tsk)
12151227
/* we have changed execution domain */
12161228
tsk->exit_signal = SIGCHLD;
12171229

1218-
#ifdef CONFIG_POSIX_TIMERS
1219-
exit_itimers(sig);
1220-
flush_itimer_signals();
1221-
#endif
1230+
BUG_ON(!thread_group_leader(tsk));
1231+
return 0;
1232+
1233+
killed:
1234+
/* protects against exit_notify() and __exit_signal() */
1235+
read_lock(&tasklist_lock);
1236+
sig->group_exit_task = NULL;
1237+
sig->notify_count = 0;
1238+
read_unlock(&tasklist_lock);
1239+
return -EAGAIN;
1240+
}
1241+
1242+
1243+
static int unshare_sighand(struct task_struct *me)
1244+
{
1245+
struct sighand_struct *oldsighand = me->sighand;
12221246

12231247
if (refcount_read(&oldsighand->count) != 1) {
12241248
struct sighand_struct *newsighand;
@@ -1236,23 +1260,13 @@ static int de_thread(struct task_struct *tsk)
12361260

12371261
write_lock_irq(&tasklist_lock);
12381262
spin_lock(&oldsighand->siglock);
1239-
rcu_assign_pointer(tsk->sighand, newsighand);
1263+
rcu_assign_pointer(me->sighand, newsighand);
12401264
spin_unlock(&oldsighand->siglock);
12411265
write_unlock_irq(&tasklist_lock);
12421266

12431267
__cleanup_sighand(oldsighand);
12441268
}
1245-
1246-
BUG_ON(!thread_group_leader(tsk));
12471269
return 0;
1248-
1249-
killed:
1250-
/* protects against exit_notify() and __exit_signal() */
1251-
read_lock(&tasklist_lock);
1252-
sig->group_exit_task = NULL;
1253-
sig->notify_count = 0;
1254-
read_unlock(&tasklist_lock);
1255-
return -EAGAIN;
12561270
}
12571271

12581272
char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk)
@@ -1286,13 +1300,13 @@ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
12861300
*/
12871301
int flush_old_exec(struct linux_binprm * bprm)
12881302
{
1303+
struct task_struct *me = current;
12891304
int retval;
12901305

12911306
/*
1292-
* Make sure we have a private signal table and that
1293-
* we are unassociated from the previous thread group.
1307+
* Make this the only thread in the thread group.
12941308
*/
1295-
retval = de_thread(current);
1309+
retval = de_thread(me);
12961310
if (retval)
12971311
goto out;
12981312

@@ -1312,26 +1326,39 @@ int flush_old_exec(struct linux_binprm * bprm)
13121326
goto out;
13131327

13141328
/*
1315-
* After clearing bprm->mm (to mark that current is using the
1316-
* prepared mm now), we have nothing left of the original
1329+
* After setting bprm->called_exec_mmap (to mark that current is
1330+
* using the prepared mm now), we have nothing left of the original
13171331
* process. If anything from here on returns an error, the check
13181332
* in search_binary_handler() will SEGV current.
13191333
*/
1334+
bprm->called_exec_mmap = 1;
13201335
bprm->mm = NULL;
13211336

1337+
#ifdef CONFIG_POSIX_TIMERS
1338+
exit_itimers(me->signal);
1339+
flush_itimer_signals();
1340+
#endif
1341+
1342+
/*
1343+
* Make the signal table private.
1344+
*/
1345+
retval = unshare_sighand(me);
1346+
if (retval)
1347+
goto out;
1348+
13221349
set_fs(USER_DS);
1323-
current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
1350+
me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
13241351
PF_NOFREEZE | PF_NO_SETAFFINITY);
13251352
flush_thread();
1326-
current->personality &= ~bprm->per_clear;
1353+
me->personality &= ~bprm->per_clear;
13271354

13281355
/*
13291356
* We have to apply CLOEXEC before we change whether the process is
13301357
* dumpable (in setup_new_exec) to avoid a race with a process in userspace
13311358
* trying to access the should-be-closed file descriptors of a process
13321359
* undergoing exec(2).
13331360
*/
1334-
do_close_on_exec(current->files);
1361+
do_close_on_exec(me->files);
13351362
return 0;
13361363

13371364
out:
@@ -1412,7 +1439,7 @@ void setup_new_exec(struct linux_binprm * bprm)
14121439

14131440
/* An exec changes our domain. We are no longer part of the thread
14141441
group */
1415-
current->self_exec_id++;
1442+
WRITE_ONCE(current->self_exec_id, current->self_exec_id + 1);
14161443
flush_signal_handlers(current, 0);
14171444
}
14181445
EXPORT_SYMBOL(setup_new_exec);
@@ -1450,6 +1477,8 @@ static void free_bprm(struct linux_binprm *bprm)
14501477
{
14511478
free_arg_pages(bprm);
14521479
if (bprm->cred) {
1480+
if (bprm->called_exec_mmap)
1481+
mutex_unlock(&current->signal->exec_update_mutex);
14531482
mutex_unlock(&current->signal->cred_guard_mutex);
14541483
abort_creds(bprm->cred);
14551484
}
@@ -1499,6 +1528,7 @@ void install_exec_creds(struct linux_binprm *bprm)
14991528
* credentials; any time after this it may be unlocked.
15001529
*/
15011530
security_bprm_committed_creds(bprm);
1531+
mutex_unlock(&current->signal->exec_update_mutex);
15021532
mutex_unlock(&current->signal->cred_guard_mutex);
15031533
}
15041534
EXPORT_SYMBOL(install_exec_creds);
@@ -1690,7 +1720,7 @@ int search_binary_handler(struct linux_binprm *bprm)
16901720

16911721
read_lock(&binfmt_lock);
16921722
put_binfmt(fmt);
1693-
if (retval < 0 && !bprm->mm) {
1723+
if (retval < 0 && bprm->called_exec_mmap) {
16941724
/* we got to flush_old_exec() and failed after it */
16951725
read_unlock(&binfmt_lock);
16961726
force_sigsegv(SIGSEGV);

0 commit comments

Comments
 (0)