Skip to content

Commit a0d4a14

Browse files
committed
Proc mount option handling is broken, and it has been since I
accidentally broke it in the middle 2016. The problem is that because we perform an internal mount of proc before user space mounts proc all of the mount options that user specifies when mounting proc are ignored. You can set those mount options with a remount but that is rather surprising. This most directly affects android which is using hidpid=2 by default. Now that the sysctl system call support has been removed, and we have settled on way of flushing proc dentries when a process exits without using proc_mnt, there is an simple and easy fix. a) Give UML mconsole it's own private mount of proc to use. b) Stop creating the internal mount of proc We still need Alexey Gladkov's full patch to get proc mount options to work inside of UML, and to be generally useful. This set of changes is just enough to get them working as well as they have in the past. If anyone sees any problem with this code please let me know. Otherwise I plan to merge these set of fixes through my tree. Link: https://lore.kernel.org/lkml/[email protected]/ Link: https://lore.kernel.org/lkml/[email protected]/ Link: https://lore.kernel.org/lkml/[email protected]/ Link: https://lore.kernel.org/lkml/[email protected]/ Fixes: e94591d ("proc: Convert proc_mount to use mount_ns.") Eric W. Biederman (4): uml: Don't consult current to find the proc_mnt in mconsole_proc uml: Create a private mount of proc for mconsole proc: Remove the now unnecessary internal mount of proc pid: Improve the comment about waiting in zap_pid_ns_processes arch/um/drivers/mconsole_kern.c | 28 +++++++++++++++++++++++++++- fs/proc/root.c | 36 ------------------------------------ include/linux/pid_namespace.h | 2 -- include/linux/proc_ns.h | 5 ----- kernel/pid.c | 8 -------- kernel/pid_namespace.c | 38 +++++++++++++++++++------------------- 6 files changed, 46 insertions(+), 71 deletions(-)
2 parents a13ae69 + af9fe6d commit a0d4a14

File tree

6 files changed

+46
-71
lines changed

6 files changed

+46
-71
lines changed

arch/um/drivers/mconsole_kern.c

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636
#include "mconsole_kern.h"
3737
#include <os.h>
3838

39+
static struct vfsmount *proc_mnt = NULL;
40+
3941
static int do_unlink_socket(struct notifier_block *notifier,
4042
unsigned long what, void *data)
4143
{
@@ -123,7 +125,7 @@ void mconsole_log(struct mc_request *req)
123125

124126
void mconsole_proc(struct mc_request *req)
125127
{
126-
struct vfsmount *mnt = task_active_pid_ns(current)->proc_mnt;
128+
struct vfsmount *mnt = proc_mnt;
127129
char *buf;
128130
int len;
129131
struct file *file;
@@ -134,6 +136,10 @@ void mconsole_proc(struct mc_request *req)
134136
ptr += strlen("proc");
135137
ptr = skip_spaces(ptr);
136138

139+
if (!mnt) {
140+
mconsole_reply(req, "Proc not available", 1, 0);
141+
goto out;
142+
}
137143
file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY, 0);
138144
if (IS_ERR(file)) {
139145
mconsole_reply(req, "Failed to open file", 1, 0);
@@ -683,6 +689,24 @@ void mconsole_stack(struct mc_request *req)
683689
with_console(req, stack_proc, to);
684690
}
685691

692+
static int __init mount_proc(void)
693+
{
694+
struct file_system_type *proc_fs_type;
695+
struct vfsmount *mnt;
696+
697+
proc_fs_type = get_fs_type("proc");
698+
if (!proc_fs_type)
699+
return -ENODEV;
700+
701+
mnt = kern_mount(proc_fs_type);
702+
put_filesystem(proc_fs_type);
703+
if (IS_ERR(mnt))
704+
return PTR_ERR(mnt);
705+
706+
proc_mnt = mnt;
707+
return 0;
708+
}
709+
686710
/*
687711
* Changed by mconsole_setup, which is __setup, and called before SMP is
688712
* active.
@@ -696,6 +720,8 @@ static int __init mconsole_init(void)
696720
int err;
697721
char file[UNIX_PATH_MAX];
698722

723+
mount_proc();
724+
699725
if (umid_file_name("mconsole", file, sizeof(file)))
700726
return -1;
701727
snprintf(mconsole_socket_name, sizeof(file), "%s", file);

fs/proc/root.c

Lines changed: 0 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -292,39 +292,3 @@ struct proc_dir_entry proc_root = {
292292
.subdir = RB_ROOT,
293293
.name = "/proc",
294294
};
295-
296-
int pid_ns_prepare_proc(struct pid_namespace *ns)
297-
{
298-
struct proc_fs_context *ctx;
299-
struct fs_context *fc;
300-
struct vfsmount *mnt;
301-
302-
fc = fs_context_for_mount(&proc_fs_type, SB_KERNMOUNT);
303-
if (IS_ERR(fc))
304-
return PTR_ERR(fc);
305-
306-
if (fc->user_ns != ns->user_ns) {
307-
put_user_ns(fc->user_ns);
308-
fc->user_ns = get_user_ns(ns->user_ns);
309-
}
310-
311-
ctx = fc->fs_private;
312-
if (ctx->pid_ns != ns) {
313-
put_pid_ns(ctx->pid_ns);
314-
get_pid_ns(ns);
315-
ctx->pid_ns = ns;
316-
}
317-
318-
mnt = fc_mount(fc);
319-
put_fs_context(fc);
320-
if (IS_ERR(mnt))
321-
return PTR_ERR(mnt);
322-
323-
ns->proc_mnt = mnt;
324-
return 0;
325-
}
326-
327-
void pid_ns_release_proc(struct pid_namespace *ns)
328-
{
329-
kern_unmount(ns->proc_mnt);
330-
}

include/linux/pid_namespace.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ struct pid_namespace {
3333
unsigned int level;
3434
struct pid_namespace *parent;
3535
#ifdef CONFIG_PROC_FS
36-
struct vfsmount *proc_mnt;
3736
struct dentry *proc_self;
3837
struct dentry *proc_thread_self;
3938
#endif
@@ -42,7 +41,6 @@ struct pid_namespace {
4241
#endif
4342
struct user_namespace *user_ns;
4443
struct ucounts *ucounts;
45-
struct work_struct proc_work;
4644
kgid_t pid_gid;
4745
int hide_pid;
4846
int reboot; /* group exit code if this pidns was rebooted */

include/linux/proc_ns.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -50,16 +50,11 @@ enum {
5050

5151
#ifdef CONFIG_PROC_FS
5252

53-
extern int pid_ns_prepare_proc(struct pid_namespace *ns);
54-
extern void pid_ns_release_proc(struct pid_namespace *ns);
5553
extern int proc_alloc_inum(unsigned int *pino);
5654
extern void proc_free_inum(unsigned int inum);
5755

5856
#else /* CONFIG_PROC_FS */
5957

60-
static inline int pid_ns_prepare_proc(struct pid_namespace *ns) { return 0; }
61-
static inline void pid_ns_release_proc(struct pid_namespace *ns) {}
62-
6358
static inline int proc_alloc_inum(unsigned int *inum)
6459
{
6560
*inum = 1;

kernel/pid.c

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -144,9 +144,6 @@ void free_pid(struct pid *pid)
144144
/* Handle a fork failure of the first process */
145145
WARN_ON(ns->child_reaper);
146146
ns->pid_allocated = 0;
147-
/* fall through */
148-
case 0:
149-
schedule_work(&ns->proc_work);
150147
break;
151148
}
152149

@@ -247,11 +244,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
247244
tmp = tmp->parent;
248245
}
249246

250-
if (unlikely(is_child_reaper(pid))) {
251-
if (pid_ns_prepare_proc(ns))
252-
goto out_free;
253-
}
254-
255247
get_pid_ns(ns);
256248
refcount_set(&pid->count, 1);
257249
for (type = 0; type < PIDTYPE_MAX; ++type)

kernel/pid_namespace.c

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -57,12 +57,6 @@ static struct kmem_cache *create_pid_cachep(unsigned int level)
5757
return READ_ONCE(*pkc);
5858
}
5959

60-
static void proc_cleanup_work(struct work_struct *work)
61-
{
62-
struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
63-
pid_ns_release_proc(ns);
64-
}
65-
6660
static struct ucounts *inc_pid_namespaces(struct user_namespace *ns)
6761
{
6862
return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES);
@@ -114,7 +108,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
114108
ns->user_ns = get_user_ns(user_ns);
115109
ns->ucounts = ucounts;
116110
ns->pid_allocated = PIDNS_ADDING;
117-
INIT_WORK(&ns->proc_work, proc_cleanup_work);
118111

119112
return ns;
120113

@@ -231,20 +224,27 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
231224
} while (rc != -ECHILD);
232225

233226
/*
234-
* kernel_wait4() above can't reap the EXIT_DEAD children but we do not
235-
* really care, we could reparent them to the global init. We could
236-
* exit and reap ->child_reaper even if it is not the last thread in
237-
* this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(),
238-
* pid_ns can not go away until proc_kill_sb() drops the reference.
227+
* kernel_wait4() misses EXIT_DEAD children, and EXIT_ZOMBIE
228+
* process whose parents processes are outside of the pid
229+
* namespace. Such processes are created with setns()+fork().
230+
*
231+
* If those EXIT_ZOMBIE processes are not reaped by their
232+
* parents before their parents exit, they will be reparented
233+
* to pid_ns->child_reaper. Thus pidns->child_reaper needs to
234+
* stay valid until they all go away.
235+
*
236+
* The code relies on the the pid_ns->child_reaper ignoring
237+
* SIGCHILD to cause those EXIT_ZOMBIE processes to be
238+
* autoreaped if reparented.
239239
*
240-
* But this ns can also have other tasks injected by setns()+fork().
241-
* Again, ignoring the user visible semantics we do not really need
242-
* to wait until they are all reaped, but they can be reparented to
243-
* us and thus we need to ensure that pid->child_reaper stays valid
244-
* until they all go away. See free_pid()->wake_up_process().
240+
* Semantically it is also desirable to wait for EXIT_ZOMBIE
241+
* processes before allowing the child_reaper to be reaped, as
242+
* that gives the invariant that when the init process of a
243+
* pid namespace is reaped all of the processes in the pid
244+
* namespace are gone.
245245
*
246-
* We rely on ignored SIGCHLD, an injected zombie must be autoreaped
247-
* if reparented.
246+
* Once all of the other tasks are gone from the pid_namespace
247+
* free_pid() will awaken this task.
248248
*/
249249
for (;;) {
250250
set_current_state(TASK_INTERRUPTIBLE);

0 commit comments

Comments
 (0)