Skip to content

Commit 5f85bd6

Browse files
committed
Merge tag 'vfs-6.14-rc1.pidfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull pidfs updates from Christian Brauner: - Rework inode number allocation Recently we received a patchset that aims to enable file handle encoding and decoding via name_to_handle_at(2) and open_by_handle_at(2). A crucial step in the patch series is how to go from inode number to struct pid without leaking information into unprivileged contexts. The issue is that in order to find a struct pid the pid number in the initial pid namespace must be encoded into the file handle via name_to_handle_at(2). This can be used by containers using a separate pid namespace to learn what the pid number of a given process in the initial pid namespace is. While this is a weak information leak it could be used in various exploits and in general is an ugly wart in the design. To solve this problem a new way is needed to look up a struct pid based on the inode number allocated for that struct pid. The other part is to remove the custom inode number allocation on 32bit systems that is also an ugly wart that should go away. Allocate unique identifiers for struct pid by simply incrementing a 64 bit counter and insert each struct pid into the rbtree so it can be looked up to decode file handles, which avoids leaking actual pids across pid namespaces in file handles. On both 64 bit and 32 bit the same 64 bit identifier is used to look up struct pid in the rbtree. On 64 bit the unique identifier for struct pid simply becomes the inode number. Comparing two pidfds continues to be as simple as comparing inode numbers. On 32 bit the 64 bit number assigned to struct pid is split into two 32 bit numbers. The lower 32 bits are used as the inode number and the upper 32 bits are used as the inode generation number. Whenever a wraparound happens on 32 bit the 64 bit number will be incremented by 2 so inode numbering starts at 2 again. When a wraparound happens on 32 bit multiple pidfds with the same inode number are likely to exist. 
This isn't a problem since before pidfs pidfds used the anonymous inode meaning all pidfds had the same inode number. On 32 bit userspace can thus reconstruct the 64 bit identifier by retrieving both the inode number and the inode generation number to compare, or use file handles. This gives the same guarantees on both 32 bit and 64 bit. - Implement file handle support This is based on custom export operation methods which allow pidfs to implement permission checking and opening of pidfs file handles cleanly without hacking around in the core file handle code too much. - Support bind-mounts Allow bind-mounting pidfds. Similar to nsfs let's allow bind-mounts for pidfds. This allows pidfds to be safely recovered and checked for process recycling. Instead of checking d_ops for both nsfs and pidfs we could in a follow-up patch add a flag argument to struct dentry_operations that functions similarly to file_operations->fop_flags. * tag 'vfs-6.14-rc1.pidfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: selftests: add pidfd bind-mount tests pidfs: allow bind-mounts pidfs: lookup pid through rbtree selftests/pidfd: add pidfs file handle selftests pidfs: check for valid ioctl commands pidfs: implement file handle support exportfs: add permission method fhandle: pull CAP_DAC_READ_SEARCH check into may_decode_fh() exportfs: add open method fhandle: simplify error handling pseudofs: add support for export_ops pidfs: support FS_IOC_GETVERSION pidfs: remove 32bit inode number handling pidfs: rework inode number allocation
2 parents 4b84a4c + 3781680 commit 5f85bd6

File tree

16 files changed

+1110
-183
lines changed

16 files changed

+1110
-183
lines changed

fs/fhandle.c

Lines changed: 56 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -187,17 +187,6 @@ static int get_path_from_fd(int fd, struct path *root)
187187
return 0;
188188
}
189189

190-
enum handle_to_path_flags {
191-
HANDLE_CHECK_PERMS = (1 << 0),
192-
HANDLE_CHECK_SUBTREE = (1 << 1),
193-
};
194-
195-
struct handle_to_path_ctx {
196-
struct path root;
197-
enum handle_to_path_flags flags;
198-
unsigned int fh_flags;
199-
};
200-
201190
static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
202191
{
203192
struct handle_to_path_ctx *ctx = context;
@@ -261,50 +250,55 @@ static int do_handle_to_path(struct file_handle *handle, struct path *path,
261250
{
262251
int handle_dwords;
263252
struct vfsmount *mnt = ctx->root.mnt;
253+
struct dentry *dentry;
264254

265255
/* change the handle size to multiple of sizeof(u32) */
266256
handle_dwords = handle->handle_bytes >> 2;
267-
path->dentry = exportfs_decode_fh_raw(mnt,
268-
(struct fid *)handle->f_handle,
269-
handle_dwords, handle->handle_type,
270-
ctx->fh_flags,
271-
vfs_dentry_acceptable, ctx);
272-
if (IS_ERR_OR_NULL(path->dentry)) {
273-
if (path->dentry == ERR_PTR(-ENOMEM))
257+
dentry = exportfs_decode_fh_raw(mnt, (struct fid *)handle->f_handle,
258+
handle_dwords, handle->handle_type,
259+
ctx->fh_flags, vfs_dentry_acceptable,
260+
ctx);
261+
if (IS_ERR_OR_NULL(dentry)) {
262+
if (dentry == ERR_PTR(-ENOMEM))
274263
return -ENOMEM;
275264
return -ESTALE;
276265
}
266+
path->dentry = dentry;
277267
path->mnt = mntget(mnt);
278268
return 0;
279269
}
280270

281-
/*
282-
* Allow relaxed permissions of file handles if the caller has the
283-
* ability to mount the filesystem or create a bind-mount of the
284-
* provided @mountdirfd.
285-
*
286-
* In both cases the caller may be able to get an unobstructed way to
287-
* the encoded file handle. If the caller is only able to create a
288-
* bind-mount we need to verify that there are no locked mounts on top
289-
* of it that could prevent us from getting to the encoded file.
290-
*
291-
* In principle, locked mounts can prevent the caller from mounting the
292-
* filesystem but that only applies to procfs and sysfs neither of which
293-
* support decoding file handles.
294-
*/
295-
static inline bool may_decode_fh(struct handle_to_path_ctx *ctx,
296-
unsigned int o_flags)
271+
static inline int may_decode_fh(struct handle_to_path_ctx *ctx,
272+
unsigned int o_flags)
297273
{
298274
struct path *root = &ctx->root;
299275

276+
if (capable(CAP_DAC_READ_SEARCH))
277+
return 0;
278+
300279
/*
301-
* Restrict to O_DIRECTORY to provide a deterministic API that avoids a
302-
* confusing api in the face of disconnected non-dir dentries.
280+
* Allow relaxed permissions of file handles if the caller has
281+
* the ability to mount the filesystem or create a bind-mount of
282+
* the provided @mountdirfd.
283+
*
284+
* In both cases the caller may be able to get an unobstructed
285+
* way to the encoded file handle. If the caller is only able to
286+
* create a bind-mount we need to verify that there are no
287+
* locked mounts on top of it that could prevent us from getting
288+
* to the encoded file.
289+
*
290+
* In principle, locked mounts can prevent the caller from
291+
* mounting the filesystem but that only applies to procfs and
292+
* sysfs neither of which support decoding file handles.
293+
*
294+
* Restrict to O_DIRECTORY to provide a deterministic API that
295+
* avoids a confusing api in the face of disconnected non-dir
296+
* dentries.
303297
*
304298
* There's only one dentry for each directory inode (VFS rule)...
305299
*/
306300
if (!(o_flags & O_DIRECTORY))
307-
return false;
301+
return -EPERM;
308302

309303
if (ns_capable(root->mnt->mnt_sb->s_user_ns, CAP_SYS_ADMIN))
310304
ctx->flags = HANDLE_CHECK_PERMS;
@@ -314,14 +308,14 @@ static inline bool may_decode_fh(struct handle_to_path_ctx *ctx,
314308
!has_locked_children(real_mount(root->mnt), root->dentry))
315309
ctx->flags = HANDLE_CHECK_PERMS | HANDLE_CHECK_SUBTREE;
316310
else
317-
return false;
311+
return -EPERM;
318312

319313
/* Are we able to override DAC permissions? */
320314
if (!ns_capable(current_user_ns(), CAP_DAC_READ_SEARCH))
321-
return false;
315+
return -EPERM;
322316

323317
ctx->fh_flags = EXPORT_FH_DIR_ONLY;
324-
return true;
318+
return 0;
325319
}
326320

327321
static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
@@ -331,15 +325,19 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
331325
struct file_handle f_handle;
332326
struct file_handle *handle = NULL;
333327
struct handle_to_path_ctx ctx = {};
328+
const struct export_operations *eops;
334329

335330
retval = get_path_from_fd(mountdirfd, &ctx.root);
336331
if (retval)
337332
goto out_err;
338333

339-
if (!capable(CAP_DAC_READ_SEARCH) && !may_decode_fh(&ctx, o_flags)) {
340-
retval = -EPERM;
334+
eops = ctx.root.mnt->mnt_sb->s_export_op;
335+
if (eops && eops->permission)
336+
retval = eops->permission(&ctx, o_flags);
337+
else
338+
retval = may_decode_fh(&ctx, o_flags);
339+
if (retval)
341340
goto out_path;
342-
}
343341

344342
if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
345343
retval = -EFAULT;
@@ -398,29 +396,28 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
398396
int open_flag)
399397
{
400398
long retval = 0;
401-
struct path path;
399+
struct path path __free(path_put) = {};
402400
struct file *file;
403-
int fd;
401+
const struct export_operations *eops;
404402

405403
retval = handle_to_path(mountdirfd, ufh, &path, open_flag);
406404
if (retval)
407405
return retval;
408406

409-
fd = get_unused_fd_flags(open_flag);
410-
if (fd < 0) {
411-
path_put(&path);
407+
CLASS(get_unused_fd, fd)(O_CLOEXEC);
408+
if (fd < 0)
412409
return fd;
413-
}
414-
file = file_open_root(&path, "", open_flag, 0);
415-
if (IS_ERR(file)) {
416-
put_unused_fd(fd);
417-
retval = PTR_ERR(file);
418-
} else {
419-
retval = fd;
420-
fd_install(fd, file);
421-
}
422-
path_put(&path);
423-
return retval;
410+
411+
eops = path.mnt->mnt_sb->s_export_op;
412+
if (eops->open)
413+
file = eops->open(&path, open_flag);
414+
else
415+
file = file_open_root(&path, "", open_flag, 0);
416+
if (IS_ERR(file))
417+
return PTR_ERR(file);
418+
419+
fd_install(fd, file);
420+
return take_fd(fd);
424421
}
425422

426423
/**

fs/libfs.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -673,6 +673,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
673673
s->s_blocksize_bits = PAGE_SHIFT;
674674
s->s_magic = ctx->magic;
675675
s->s_op = ctx->ops ?: &simple_super_operations;
676+
s->s_export_op = ctx->eops;
676677
s->s_xattr = ctx->xattr;
677678
s->s_time_gran = 1;
678679
root = new_inode(s);

fs/namespace.c

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include <linux/fs_context.h>
3333
#include <linux/shmem_fs.h>
3434
#include <linux/mnt_idmapping.h>
35+
#include <linux/pidfs.h>
3536
#include <linux/nospec.h>
3637

3738
#include "pnode.h"
@@ -2736,8 +2737,13 @@ static struct mount *__do_loopback(struct path *old_path, int recurse)
27362737
if (IS_MNT_UNBINDABLE(old))
27372738
return mnt;
27382739

2739-
if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
2740-
return mnt;
2740+
if (!check_mnt(old)) {
2741+
const struct dentry_operations *d_op = old_path->dentry->d_op;
2742+
2743+
if (d_op != &ns_dentry_operations &&
2744+
d_op != &pidfs_dentry_operations)
2745+
return mnt;
2746+
}
27412747

27422748
if (!recurse && has_locked_children(old, old_path->dentry))
27432749
return mnt;

0 commit comments

Comments
 (0)