Skip to content

Commit ca567df

Browse files
committed
nsfs: add pid translation ioctls
Add ioctl()s to translate pids between pid namespaces. LXCFS is a tiny FUSE filesystem used to virtualize various aspects of procfs. LXCFS is run on the host. The files and directories it creates can be bind-mounted by e.g. a container at startup and mounted over the various procfs files the container wishes to have virtualized. When e.g. a read request for uptime is received, LXCFS will receive the pid of the reader. In order to virtualize the corresponding read, LXCFS needs to know the pid of the init process of the reader's pid namespace. In order to do this, LXCFS first needs to fork() two helper processes. The first helper process calls setns() to enter the reader's pid namespace. The second helper process is needed to create a process that is a proper member of the pid namespace. The second helper process then creates a ucred message with ucred.pid set to 1 and sends it back to LXCFS. The kernel will translate the ucred.pid field to the corresponding pid number in LXCFS's pid namespace. This way LXCFS can learn the init pid number of the reader's pid namespace and can go on to virtualize. Since these two fork()s are costly, LXCFS maintains an init pid cache that caches a given pid for a fixed amount of time. The cache is pruned during new read requests. However, even with the cache, the hit of the two fork()s is significant when a very large number of containers are running. With this simple patch we add an ns ioctl that lets a caller retrieve the init pid nr of a pid namespace through its pid namespace fd. This significantly improves performance with a very simple change. Support translation of pids and tgids. Other concepts can be added but there are no obvious users for this right now. To protect against races, pidfds can be used to check whether the process is still valid. If needed, this can also be extended to work on pidfds directly.
Link: https://lore.kernel.org/r/[email protected] Reviewed-by: Alexander Mikhalitsyn <[email protected]> Signed-off-by: Christian Brauner <[email protected]>
1 parent 1613e60 commit ca567df

File tree

2 files changed

+60
-1
lines changed

2 files changed

+60
-1
lines changed

fs/nsfs.c

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include <linux/magic.h>
99
#include <linux/ktime.h>
1010
#include <linux/seq_file.h>
11+
#include <linux/pid_namespace.h>
1112
#include <linux/user_namespace.h>
1213
#include <linux/nsfs.h>
1314
#include <linux/uaccess.h>
@@ -123,9 +124,12 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
123124
unsigned long arg)
124125
{
125126
struct user_namespace *user_ns;
127+
struct pid_namespace *pid_ns;
128+
struct task_struct *tsk;
126129
struct ns_common *ns = get_proc_ns(file_inode(filp));
127130
uid_t __user *argp;
128131
uid_t uid;
132+
int ret;
129133

130134
switch (ioctl) {
131135
case NS_GET_USERNS:
@@ -143,9 +147,56 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
143147
argp = (uid_t __user *) arg;
144148
uid = from_kuid_munged(current_user_ns(), user_ns->owner);
145149
return put_user(uid, argp);
150+
case NS_GET_PID_FROM_PIDNS:
151+
fallthrough;
152+
case NS_GET_TGID_FROM_PIDNS:
153+
fallthrough;
154+
case NS_GET_PID_IN_PIDNS:
155+
fallthrough;
156+
case NS_GET_TGID_IN_PIDNS:
157+
if (ns->ops->type != CLONE_NEWPID)
158+
return -EINVAL;
159+
160+
ret = -ESRCH;
161+
pid_ns = container_of(ns, struct pid_namespace, ns);
162+
163+
rcu_read_lock();
164+
165+
if (ioctl == NS_GET_PID_IN_PIDNS ||
166+
ioctl == NS_GET_TGID_IN_PIDNS)
167+
tsk = find_task_by_vpid(arg);
168+
else
169+
tsk = find_task_by_pid_ns(arg, pid_ns);
170+
if (!tsk)
171+
break;
172+
173+
switch (ioctl) {
174+
case NS_GET_PID_FROM_PIDNS:
175+
ret = task_pid_vnr(tsk);
176+
break;
177+
case NS_GET_TGID_FROM_PIDNS:
178+
ret = task_tgid_vnr(tsk);
179+
break;
180+
case NS_GET_PID_IN_PIDNS:
181+
ret = task_pid_nr_ns(tsk, pid_ns);
182+
break;
183+
case NS_GET_TGID_IN_PIDNS:
184+
ret = task_tgid_nr_ns(tsk, pid_ns);
185+
break;
186+
default:
187+
ret = 0;
188+
break;
189+
}
190+
rcu_read_unlock();
191+
192+
if (!ret)
193+
ret = -ESRCH;
194+
break;
146195
default:
147-
return -ENOTTY;
196+
ret = -ENOTTY;
148197
}
198+
199+
return ret;
149200
}
150201

151202
int ns_get_name(char *buf, size_t size, struct task_struct *task,

include/uapi/linux/nsfs.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,5 +15,13 @@
1515
#define NS_GET_NSTYPE _IO(NSIO, 0x3)
1616
/* Get owner UID (in the caller's user namespace) for a user namespace */
1717
#define NS_GET_OWNER_UID _IO(NSIO, 0x4)
18+
/* Translate pid from target pid namespace into the caller's pid namespace. */
19+
#define NS_GET_PID_FROM_PIDNS _IOR(NSIO, 0x5, int)
20+
/* Return thread-group leader id of pid in the caller's pid namespace. */
21+
#define NS_GET_TGID_FROM_PIDNS _IOR(NSIO, 0x7, int)
22+
/* Translate pid from caller's pid namespace into a target pid namespace. */
23+
#define NS_GET_PID_IN_PIDNS _IOR(NSIO, 0x6, int)
24+
/* Return thread-group leader id of pid in the target pid namespace. */
25+
#define NS_GET_TGID_IN_PIDNS _IOR(NSIO, 0x8, int)
1826

1927
#endif /* __LINUX_NSFS_H */

0 commit comments

Comments
 (0)