Skip to content

Commit fa10fed

Browse files
legionus authored and
ebiederm committed
proc: allow to mount many instances of proc in one pid namespace
This patch makes it possible to have multiple procfs instances inside the same pid namespace. The aim here is lightweight sandboxes, and to allow that we have to modernize procfs internals. 1) The main aim of this work is to have one supervisor for apps on embedded systems. Right now we have some lightweight sandbox support; however, if we create pid namespaces we have to manage all the processes inside them too, whereas our goal is to be able to run a bunch of apps, each one inside its own mount namespace, without them being able to notice each other. We only want to use mount namespaces, and we want procfs to behave more like a real mount point. 2) Linux Security Modules have multiple ptrace paths inside some subsystems; however, inside procfs, the implementation does not guarantee that the ptrace() check which triggers the security_ptrace_check() hook will always run. We have the 'hidepid' mount option that can be used to force the ptrace_may_access() check inside has_pid_permissions() to run. The problem is that 'hidepid' is per pid namespace and not attached to the mount point, so any remount or modification of 'hidepid' will propagate to all other procfs mounts. This also makes it hard to support the Yama LSM in desktop and user sessions. The Yama ptrace scope, which restricts ptrace and some other syscalls to be allowed only on inferiors, can be updated to have a per-task context, where the context will be inherited during fork() and clone() and preserved across execve(). If we support multiple private procfs instances, then we may force the ptrace_may_access() check on /proc/<pids>/ to always run inside those new procfs instances. This will make it possible to specify, for user sessions, whether we should populate procfs only with pids that the user can ptrace. By using the Yama ptrace scope, some restricted users will only be able to see inferiors inside /proc; they won't even be able to see their other processes. 
Some software, like Chromium, Firefox's crash handler, Wine and others, already uses Yama to restrict which processes can be ptraceable. This change makes it possible to restrict /proc/<pids>/ as well, but more importantly it gives desktop users a generic and usable way to specify which users should see all processes and which users cannot. Side notes: * This covers a gap in seccomp, which is not able to parse arguments: it is easy to install a seccomp filter on direct syscalls that operate on pids, however /proc/<pid>/ is a Linux ABI that is accessed through filesystem syscalls. With this change LSMs should be able to analyze open/read/write/close... In the new patch set version I removed the 'newinstance' option as suggested by Eric W. Biederman. A selftest has been added to verify the new behavior. Signed-off-by: Alexey Gladkov <[email protected]> Reviewed-by: Alexey Dobriyan <[email protected]> Reviewed-by: Kees Cook <[email protected]> Signed-off-by: Eric W. Biederman <[email protected]>
1 parent 1e88c42 commit fa10fed

File tree

10 files changed

+124
-63
lines changed

10 files changed

+124
-63
lines changed

fs/proc/base.c

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -697,32 +697,32 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr)
697697
* May current process learn task's sched/cmdline info (for hide_pid_min=1)
698698
* or euid/egid (for hide_pid_min=2)?
699699
*/
700-
static bool has_pid_permissions(struct pid_namespace *pid,
700+
static bool has_pid_permissions(struct proc_fs_info *fs_info,
701701
struct task_struct *task,
702702
int hide_pid_min)
703703
{
704-
if (pid->hide_pid < hide_pid_min)
704+
if (fs_info->hide_pid < hide_pid_min)
705705
return true;
706-
if (in_group_p(pid->pid_gid))
706+
if (in_group_p(fs_info->pid_gid))
707707
return true;
708708
return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
709709
}
710710

711711

712712
static int proc_pid_permission(struct inode *inode, int mask)
713713
{
714-
struct pid_namespace *pid = proc_pid_ns(inode);
714+
struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
715715
struct task_struct *task;
716716
bool has_perms;
717717

718718
task = get_proc_task(inode);
719719
if (!task)
720720
return -ESRCH;
721-
has_perms = has_pid_permissions(pid, task, HIDEPID_NO_ACCESS);
721+
has_perms = has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS);
722722
put_task_struct(task);
723723

724724
if (!has_perms) {
725-
if (pid->hide_pid == HIDEPID_INVISIBLE) {
725+
if (fs_info->hide_pid == HIDEPID_INVISIBLE) {
726726
/*
727727
* Let's make getdents(), stat(), and open()
728728
* consistent with each other. If a process
@@ -1897,7 +1897,7 @@ int pid_getattr(const struct path *path, struct kstat *stat,
18971897
u32 request_mask, unsigned int query_flags)
18981898
{
18991899
struct inode *inode = d_inode(path->dentry);
1900-
struct pid_namespace *pid = proc_pid_ns(inode);
1900+
struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
19011901
struct task_struct *task;
19021902

19031903
generic_fillattr(inode, stat);
@@ -1907,7 +1907,7 @@ int pid_getattr(const struct path *path, struct kstat *stat,
19071907
rcu_read_lock();
19081908
task = pid_task(proc_pid(inode), PIDTYPE_PID);
19091909
if (task) {
1910-
if (!has_pid_permissions(pid, task, HIDEPID_INVISIBLE)) {
1910+
if (!has_pid_permissions(fs_info, task, HIDEPID_INVISIBLE)) {
19111911
rcu_read_unlock();
19121912
/*
19131913
* This doesn't prevent learning whether PID exists,
@@ -3301,14 +3301,16 @@ struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
33013301
{
33023302
struct task_struct *task;
33033303
unsigned tgid;
3304+
struct proc_fs_info *fs_info;
33043305
struct pid_namespace *ns;
33053306
struct dentry *result = ERR_PTR(-ENOENT);
33063307

33073308
tgid = name_to_int(&dentry->d_name);
33083309
if (tgid == ~0U)
33093310
goto out;
33103311

3311-
ns = dentry->d_sb->s_fs_info;
3312+
fs_info = proc_sb_info(dentry->d_sb);
3313+
ns = fs_info->pid_ns;
33123314
rcu_read_lock();
33133315
task = find_task_by_pid_ns(tgid, ns);
33143316
if (task)
@@ -3372,20 +3374,21 @@ static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter ite
33723374
int proc_pid_readdir(struct file *file, struct dir_context *ctx)
33733375
{
33743376
struct tgid_iter iter;
3377+
struct proc_fs_info *fs_info = proc_sb_info(file_inode(file)->i_sb);
33753378
struct pid_namespace *ns = proc_pid_ns(file_inode(file));
33763379
loff_t pos = ctx->pos;
33773380

33783381
if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
33793382
return 0;
33803383

33813384
if (pos == TGID_OFFSET - 2) {
3382-
struct inode *inode = d_inode(ns->proc_self);
3385+
struct inode *inode = d_inode(fs_info->proc_self);
33833386
if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
33843387
return 0;
33853388
ctx->pos = pos = pos + 1;
33863389
}
33873390
if (pos == TGID_OFFSET - 1) {
3388-
struct inode *inode = d_inode(ns->proc_thread_self);
3391+
struct inode *inode = d_inode(fs_info->proc_thread_self);
33893392
if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK))
33903393
return 0;
33913394
ctx->pos = pos = pos + 1;
@@ -3399,7 +3402,7 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx)
33993402
unsigned int len;
34003403

34013404
cond_resched();
3402-
if (!has_pid_permissions(ns, iter.task, HIDEPID_INVISIBLE))
3405+
if (!has_pid_permissions(fs_info, iter.task, HIDEPID_INVISIBLE))
34033406
continue;
34043407

34053408
len = snprintf(name, sizeof(name), "%u", iter.tgid);
@@ -3599,6 +3602,7 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry
35993602
struct task_struct *task;
36003603
struct task_struct *leader = get_proc_task(dir);
36013604
unsigned tid;
3605+
struct proc_fs_info *fs_info;
36023606
struct pid_namespace *ns;
36033607
struct dentry *result = ERR_PTR(-ENOENT);
36043608

@@ -3609,7 +3613,8 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry
36093613
if (tid == ~0U)
36103614
goto out;
36113615

3612-
ns = dentry->d_sb->s_fs_info;
3616+
fs_info = proc_sb_info(dentry->d_sb);
3617+
ns = fs_info->pid_ns;
36133618
rcu_read_lock();
36143619
task = find_task_by_pid_ns(tid, ns);
36153620
if (task)

fs/proc/inode.c

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -167,13 +167,12 @@ void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock
167167

168168
static int proc_show_options(struct seq_file *seq, struct dentry *root)
169169
{
170-
struct super_block *sb = root->d_sb;
171-
struct pid_namespace *pid = sb->s_fs_info;
170+
struct proc_fs_info *fs_info = proc_sb_info(root->d_sb);
172171

173-
if (!gid_eq(pid->pid_gid, GLOBAL_ROOT_GID))
174-
seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid));
175-
if (pid->hide_pid != HIDEPID_OFF)
176-
seq_printf(seq, ",hidepid=%u", pid->hide_pid);
172+
if (!gid_eq(fs_info->pid_gid, GLOBAL_ROOT_GID))
173+
seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, fs_info->pid_gid));
174+
if (fs_info->hide_pid != HIDEPID_OFF)
175+
seq_printf(seq, ",hidepid=%u", fs_info->hide_pid);
177176

178177
return 0;
179178
}

fs/proc/root.c

Lines changed: 24 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -77,26 +77,31 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
7777
return 0;
7878
}
7979

80-
static void proc_apply_options(struct super_block *s,
80+
static void proc_apply_options(struct proc_fs_info *fs_info,
8181
struct fs_context *fc,
82-
struct pid_namespace *pid_ns,
8382
struct user_namespace *user_ns)
8483
{
8584
struct proc_fs_context *ctx = fc->fs_private;
8685

8786
if (ctx->mask & (1 << Opt_gid))
88-
pid_ns->pid_gid = make_kgid(user_ns, ctx->gid);
87+
fs_info->pid_gid = make_kgid(user_ns, ctx->gid);
8988
if (ctx->mask & (1 << Opt_hidepid))
90-
pid_ns->hide_pid = ctx->hidepid;
89+
fs_info->hide_pid = ctx->hidepid;
9190
}
9291

9392
static int proc_fill_super(struct super_block *s, struct fs_context *fc)
9493
{
95-
struct pid_namespace *pid_ns = get_pid_ns(s->s_fs_info);
94+
struct proc_fs_context *ctx = fc->fs_private;
9695
struct inode *root_inode;
96+
struct proc_fs_info *fs_info;
9797
int ret;
9898

99-
proc_apply_options(s, fc, pid_ns, current_user_ns());
99+
fs_info = kzalloc(sizeof(*fs_info), GFP_KERNEL);
100+
if (!fs_info)
101+
return -ENOMEM;
102+
103+
fs_info->pid_ns = get_pid_ns(ctx->pid_ns);
104+
proc_apply_options(fs_info, fc, current_user_ns());
100105

101106
/* User space would break if executables or devices appear on proc */
102107
s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
@@ -106,14 +111,15 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc)
106111
s->s_magic = PROC_SUPER_MAGIC;
107112
s->s_op = &proc_sops;
108113
s->s_time_gran = 1;
114+
s->s_fs_info = fs_info;
109115

110116
/*
111117
* procfs isn't actually a stacking filesystem; however, there is
112118
* too much magic going on inside it to permit stacking things on
113119
* top of it
114120
*/
115121
s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
116-
122+
117123
/* procfs dentries and inodes don't require IO to create */
118124
s->s_shrink.seeks = 0;
119125

@@ -140,19 +146,17 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc)
140146
static int proc_reconfigure(struct fs_context *fc)
141147
{
142148
struct super_block *sb = fc->root->d_sb;
143-
struct pid_namespace *pid = sb->s_fs_info;
149+
struct proc_fs_info *fs_info = proc_sb_info(sb);
144150

145151
sync_filesystem(sb);
146152

147-
proc_apply_options(sb, fc, pid, current_user_ns());
153+
proc_apply_options(fs_info, fc, current_user_ns());
148154
return 0;
149155
}
150156

151157
static int proc_get_tree(struct fs_context *fc)
152158
{
153-
struct proc_fs_context *ctx = fc->fs_private;
154-
155-
return get_tree_keyed(fc, proc_fill_super, ctx->pid_ns);
159+
return get_tree_nodev(fc, proc_fill_super);
156160
}
157161

158162
static void proc_fs_context_free(struct fs_context *fc)
@@ -188,22 +192,17 @@ static int proc_init_fs_context(struct fs_context *fc)
188192

189193
static void proc_kill_sb(struct super_block *sb)
190194
{
191-
struct pid_namespace *ns;
195+
struct proc_fs_info *fs_info = proc_sb_info(sb);
192196

193-
ns = (struct pid_namespace *)sb->s_fs_info;
194-
if (ns->proc_self)
195-
dput(ns->proc_self);
196-
if (ns->proc_thread_self)
197-
dput(ns->proc_thread_self);
198-
kill_anon_super(sb);
197+
if (fs_info->proc_self)
198+
dput(fs_info->proc_self);
199199

200-
/* Make the pid namespace safe for the next mount of proc */
201-
ns->proc_self = NULL;
202-
ns->proc_thread_self = NULL;
203-
ns->pid_gid = GLOBAL_ROOT_GID;
204-
ns->hide_pid = 0;
200+
if (fs_info->proc_thread_self)
201+
dput(fs_info->proc_thread_self);
205202

206-
put_pid_ns(ns);
203+
kill_anon_super(sb);
204+
put_pid_ns(fs_info->pid_ns);
205+
kfree(fs_info);
207206
}
208207

209208
static struct file_system_type proc_fs_type = {

fs/proc/self.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,10 @@ static unsigned self_inum __ro_after_init;
3636
int proc_setup_self(struct super_block *s)
3737
{
3838
struct inode *root_inode = d_inode(s->s_root);
39-
struct pid_namespace *ns = proc_pid_ns(root_inode);
39+
struct proc_fs_info *fs_info = proc_sb_info(s);
4040
struct dentry *self;
4141
int ret = -ENOMEM;
42-
42+
4343
inode_lock(root_inode);
4444
self = d_alloc_name(s->s_root, "self");
4545
if (self) {
@@ -62,7 +62,7 @@ int proc_setup_self(struct super_block *s)
6262
if (ret)
6363
pr_err("proc_fill_super: can't allocate /proc/self\n");
6464
else
65-
ns->proc_self = self;
65+
fs_info->proc_self = self;
6666

6767
return ret;
6868
}

fs/proc/thread_self.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ static unsigned thread_self_inum __ro_after_init;
3636
int proc_setup_thread_self(struct super_block *s)
3737
{
3838
struct inode *root_inode = d_inode(s->s_root);
39-
struct pid_namespace *ns = proc_pid_ns(root_inode);
39+
struct proc_fs_info *fs_info = proc_sb_info(s);
4040
struct dentry *thread_self;
4141
int ret = -ENOMEM;
4242

@@ -60,9 +60,9 @@ int proc_setup_thread_self(struct super_block *s)
6060
inode_unlock(root_inode);
6161

6262
if (ret)
63-
pr_err("proc_fill_super: can't allocate /proc/thread_self\n");
63+
pr_err("proc_fill_super: can't allocate /proc/thread-self\n");
6464
else
65-
ns->proc_thread_self = thread_self;
65+
fs_info->proc_thread_self = thread_self;
6666

6767
return ret;
6868
}

include/linux/pid_namespace.h

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,6 @@
1717

1818
struct fs_pin;
1919

20-
enum { /* definitions for pid_namespace's hide_pid field */
21-
HIDEPID_OFF = 0,
22-
HIDEPID_NO_ACCESS = 1,
23-
HIDEPID_INVISIBLE = 2,
24-
};
25-
2620
struct pid_namespace {
2721
struct kref kref;
2822
struct idr idr;
@@ -32,17 +26,11 @@ struct pid_namespace {
3226
struct kmem_cache *pid_cachep;
3327
unsigned int level;
3428
struct pid_namespace *parent;
35-
#ifdef CONFIG_PROC_FS
36-
struct dentry *proc_self;
37-
struct dentry *proc_thread_self;
38-
#endif
3929
#ifdef CONFIG_BSD_PROCESS_ACCT
4030
struct fs_pin *bacct;
4131
#endif
4232
struct user_namespace *user_ns;
4333
struct ucounts *ucounts;
44-
kgid_t pid_gid;
45-
int hide_pid;
4634
int reboot; /* group exit code if this pidns was rebooted */
4735
struct ns_common ns;
4836
} __randomize_layout;

include/linux/proc_fs.h

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,26 @@ struct proc_ops {
4242
unsigned long (*proc_get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
4343
} __randomize_layout;
4444

45+
/* definitions for hide_pid field */
46+
enum {
47+
HIDEPID_OFF = 0,
48+
HIDEPID_NO_ACCESS = 1,
49+
HIDEPID_INVISIBLE = 2,
50+
};
51+
52+
struct proc_fs_info {
53+
struct pid_namespace *pid_ns;
54+
struct dentry *proc_self; /* For /proc/self */
55+
struct dentry *proc_thread_self; /* For /proc/thread-self */
56+
kgid_t pid_gid;
57+
int hide_pid;
58+
};
59+
60+
static inline struct proc_fs_info *proc_sb_info(struct super_block *sb)
61+
{
62+
return sb->s_fs_info;
63+
}
64+
4565
#ifdef CONFIG_PROC_FS
4666

4767
typedef int (*proc_write_t)(struct file *, char *, size_t);
@@ -176,7 +196,7 @@ int open_related_ns(struct ns_common *ns,
176196
/* get the associated pid namespace for a file in procfs */
177197
static inline struct pid_namespace *proc_pid_ns(const struct inode *inode)
178198
{
179-
return inode->i_sb->s_fs_info;
199+
return proc_sb_info(inode->i_sb)->pid_ns;
180200
}
181201

182202
#endif /* _LINUX_PROC_FS_H */

tools/testing/selftests/proc/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
/fd-002-posix-eq
44
/fd-003-kthread
55
/proc-loadavg-001
6+
/proc-multiple-procfs
67
/proc-pid-vm
78
/proc-self-map-files-001
89
/proc-self-map-files-002

tools/testing/selftests/proc/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,6 @@ TEST_GEN_PROGS += self
1919
TEST_GEN_PROGS += setns-dcache
2020
TEST_GEN_PROGS += setns-sysvipc
2121
TEST_GEN_PROGS += thread-self
22+
TEST_GEN_PROGS += proc-multiple-procfs
2223

2324
include ../lib.mk

0 commit comments

Comments
 (0)