Skip to content

Commit 923ea4d

Browse files
committed
Merge patch series "net, pidfs: enable handing out pidfds for reaped sk->sk_peer_pid"
Christian Brauner <[email protected]> says: SO_PEERPIDFD currently doesn't support handing out pidfds if the sk->sk_peer_pid thread-group leader has already been reaped. In this case it currently returns EINVAL. Userspace still wants to get a pidfd for a reaped process to have a stable handle it can pass on. This is especially useful now that it is possible to retrieve exit information through a pidfd via the PIDFD_GET_INFO ioctl()'s PIDFD_INFO_EXIT flag. Another summary has been provided by David in [1]: > A pidfd can outlive the task it refers to, and thus user-space must > already be prepared that the task underlying a pidfd is gone at the time > they get their hands on the pidfd. For instance, resolving the pidfd to > a PID via the fdinfo must be prepared to read `-1`. > > Despite user-space knowing that a pidfd might be stale, several kernel > APIs currently add another layer that checks for this. In particular, > SO_PEERPIDFD returns `EINVAL` if the peer-task was already reaped, > but returns a stale pidfd if the task is reaped immediately after the > respective alive-check. > > This has the unfortunate effect that user-space now has two ways to > check for the exact same scenario: A syscall might return > EINVAL/ESRCH/... *or* the pidfd might be stale, even though there is no > particular reason to distinguish both cases. This also propagates > through user-space APIs, which pass on pidfds. They must be prepared to > pass on `-1` *or* the pidfd, because there is no guaranteed way to get a > stale pidfd from the kernel. > Userspace must already deal with a pidfd referring to a reaped task as > the task may exit and get reaped at any time will there are still many > pidfds referring to it. In order to allow handing out reaped pidfd SO_PEERPIDFD needs to ensure that PIDFD_INFO_EXIT information is available whenever a pidfd for a reaped task is created by PIDFD_INFO_EXIT. The uapi promises that reaped pidfds are only handed out if it is guaranteed that the caller sees the exit information: TEST_F(pidfd_info, success_reaped) { struct pidfd_info info = { .mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT, }; /* * Process has already been reaped and PIDFD_INFO_EXIT been set. * Verify that we can retrieve the exit status of the process. */ ASSERT_EQ(ioctl(self->child_pidfd4, PIDFD_GET_INFO, &info), 0); ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS)); ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT)); ASSERT_TRUE(WIFEXITED(info.exit_code)); ASSERT_EQ(WEXITSTATUS(info.exit_code), 0); } To hand out pidfds for reaped processes we thus allocate a pidfs entry for the relevant sk->sk_peer_pid at the time the sk->sk_peer_pid is stashed and drop it when the socket is destroyed. This guarantees that exit information will always be recorded for the sk->sk_peer_pid task and we can hand out pidfds for reaped processes. * patches from https://lore.kernel.org/[email protected]: net, pidfs: enable handing out pidfds for reaped sk->sk_peer_pid pidfs: get rid of __pidfd_prepare() net, pidfs: prepare for handing out pidfds for reaped sk->sk_peer_pid pidfs: register pid in pidfs Link: https://lore.kernel.org/[email protected] Signed-off-by: Christian Brauner <[email protected]>
2 parents b590c92 + 20b70e5 commit 923ea4d

File tree

7 files changed

+192
-86
lines changed

7 files changed

+192
-86
lines changed

fs/pidfs.c

Lines changed: 72 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -768,7 +768,7 @@ static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path,
768768
{
769769
enum pid_type type;
770770

771-
if (flags & PIDFD_CLONE)
771+
if (flags & PIDFD_STALE)
772772
return true;
773773

774774
/*
@@ -777,10 +777,14 @@ static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path,
777777
* pidfd has been allocated perform another check that the pid
778778
* is still alive. If it is exit information is available even
779779
* if the task gets reaped before the pidfd is returned to
780-
* userspace. The only exception is PIDFD_CLONE where no task
781-
* linkage has been established for @pid yet and the kernel is
782-
* in the middle of process creation so there's nothing for
783-
* pidfs to miss.
780+
* userspace. The only exception are indicated by PIDFD_STALE:
781+
*
782+
* (1) The kernel is in the middle of task creation and thus no
783+
* task linkage has been established yet.
784+
* (2) The caller knows @pid has been registered in pidfs at a
785+
* time when the task was still alive.
786+
*
787+
* In both cases exit information will have been reported.
784788
*/
785789
if (flags & PIDFD_THREAD)
786790
type = PIDTYPE_PID;
@@ -874,11 +878,11 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
874878
int ret;
875879

876880
/*
877-
* Ensure that PIDFD_CLONE can be passed as a flag without
881+
* Ensure that PIDFD_STALE can be passed as a flag without
878882
* overloading other uapi pidfd flags.
879883
*/
880-
BUILD_BUG_ON(PIDFD_CLONE == PIDFD_THREAD);
881-
BUILD_BUG_ON(PIDFD_CLONE == PIDFD_NONBLOCK);
884+
BUILD_BUG_ON(PIDFD_STALE == PIDFD_THREAD);
885+
BUILD_BUG_ON(PIDFD_STALE == PIDFD_NONBLOCK);
882886

883887
ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
884888
if (ret < 0)
@@ -887,7 +891,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
887891
if (!pidfs_pid_valid(pid, &path, flags))
888892
return ERR_PTR(-ESRCH);
889893

890-
flags &= ~PIDFD_CLONE;
894+
flags &= ~PIDFD_STALE;
891895
pidfd_file = dentry_open(&path, flags, current_cred());
892896
/* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */
893897
if (!IS_ERR(pidfd_file))
@@ -896,6 +900,65 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
896900
return pidfd_file;
897901
}
898902

903+
/**
904+
* pidfs_register_pid - register a struct pid in pidfs
905+
* @pid: pid to pin
906+
*
907+
* Register a struct pid in pidfs. Needs to be paired with
908+
* pidfs_put_pid() to not risk leaking the pidfs dentry and inode.
909+
*
910+
* Return: On success zero, on error a negative error code is returned.
911+
*/
912+
int pidfs_register_pid(struct pid *pid)
913+
{
914+
struct path path __free(path_put) = {};
915+
int ret;
916+
917+
might_sleep();
918+
919+
if (!pid)
920+
return 0;
921+
922+
ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
923+
if (unlikely(ret))
924+
return ret;
925+
/* Keep the dentry and only put the reference to the mount. */
926+
path.dentry = NULL;
927+
return 0;
928+
}
929+
930+
/**
931+
* pidfs_get_pid - pin a struct pid through pidfs
932+
* @pid: pid to pin
933+
*
934+
* Similar to pidfs_register_pid() but only valid if the caller knows
935+
* there's a reference to the @pid through a dentry already that can't
936+
* go away.
937+
*/
938+
void pidfs_get_pid(struct pid *pid)
939+
{
940+
if (!pid)
941+
return;
942+
WARN_ON_ONCE(!stashed_dentry_get(&pid->stashed));
943+
}
944+
945+
/**
946+
* pidfs_put_pid - drop a pidfs reference
947+
* @pid: pid to drop
948+
*
949+
* Drop a reference to @pid via pidfs. This is only safe if the
950+
* reference has been taken via pidfs_get_pid().
951+
*/
952+
void pidfs_put_pid(struct pid *pid)
953+
{
954+
might_sleep();
955+
956+
if (!pid)
957+
return;
958+
VFS_WARN_ON_ONCE(!pid->stashed);
959+
dput(pid->stashed);
960+
}
961+
899962
static void pidfs_inode_init_once(void *data)
900963
{
901964
struct pidfs_inode *pi = data;

include/linux/pid.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ struct file;
7777
struct pid *pidfd_pid(const struct file *file);
7878
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
7979
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
80-
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret);
80+
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file);
8181
void do_notify_pidfd(struct task_struct *task);
8282

8383
static inline struct pid *get_pid(struct pid *pid)

include/linux/pidfs.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,8 @@ void pidfs_add_pid(struct pid *pid);
88
void pidfs_remove_pid(struct pid *pid);
99
void pidfs_exit(struct task_struct *tsk);
1010
extern const struct dentry_operations pidfs_dentry_operations;
11+
int pidfs_register_pid(struct pid *pid);
12+
void pidfs_get_pid(struct pid *pid);
13+
void pidfs_put_pid(struct pid *pid);
1114

1215
#endif /* _LINUX_PID_FS_H */

include/uapi/linux/pidfd.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
#define PIDFD_THREAD O_EXCL
1313
#ifdef __KERNEL__
1414
#include <linux/sched.h>
15-
#define PIDFD_CLONE CLONE_PIDFD
15+
#define PIDFD_STALE CLONE_PIDFD
1616
#endif
1717

1818
/* Flags for pidfd_send_signal(). */

kernel/fork.c

Lines changed: 29 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -2035,55 +2035,11 @@ static inline void rcu_copy_process(struct task_struct *p)
20352035
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
20362036
}
20372037

2038-
/**
2039-
* __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
2040-
* @pid: the struct pid for which to create a pidfd
2041-
* @flags: flags of the new @pidfd
2042-
* @ret: Where to return the file for the pidfd.
2043-
*
2044-
* Allocate a new file that stashes @pid and reserve a new pidfd number in the
2045-
* caller's file descriptor table. The pidfd is reserved but not installed yet.
2046-
*
2047-
* The helper doesn't perform checks on @pid which makes it useful for pidfds
2048-
* created via CLONE_PIDFD where @pid has no task attached when the pidfd and
2049-
* pidfd file are prepared.
2050-
*
2051-
* If this function returns successfully the caller is responsible to either
2052-
* call fd_install() passing the returned pidfd and pidfd file as arguments in
2053-
* order to install the pidfd into its file descriptor table or they must use
2054-
* put_unused_fd() and fput() on the returned pidfd and pidfd file
2055-
* respectively.
2056-
*
2057-
* This function is useful when a pidfd must already be reserved but there
2058-
* might still be points of failure afterwards and the caller wants to ensure
2059-
* that no pidfd is leaked into its file descriptor table.
2060-
*
2061-
* Return: On success, a reserved pidfd is returned from the function and a new
2062-
* pidfd file is returned in the last argument to the function. On
2063-
* error, a negative error code is returned from the function and the
2064-
* last argument remains unchanged.
2065-
*/
2066-
static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
2067-
{
2068-
struct file *pidfd_file;
2069-
2070-
CLASS(get_unused_fd, pidfd)(O_CLOEXEC);
2071-
if (pidfd < 0)
2072-
return pidfd;
2073-
2074-
pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR);
2075-
if (IS_ERR(pidfd_file))
2076-
return PTR_ERR(pidfd_file);
2077-
2078-
*ret = pidfd_file;
2079-
return take_fd(pidfd);
2080-
}
2081-
20822038
/**
20832039
* pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
20842040
* @pid: the struct pid for which to create a pidfd
20852041
* @flags: flags of the new @pidfd
2086-
* @ret: Where to return the pidfd.
2042+
* @ret_file: return the new pidfs file
20872043
*
20882044
* Allocate a new file that stashes @pid and reserve a new pidfd number in the
20892045
* caller's file descriptor table. The pidfd is reserved but not installed yet.
@@ -2106,16 +2062,26 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
21062062
* error, a negative error code is returned from the function and the
21072063
* last argument remains unchanged.
21082064
*/
2109-
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
2065+
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file)
21102066
{
2067+
struct file *pidfs_file;
2068+
21112069
/*
2112-
* While holding the pidfd waitqueue lock removing the task
2113-
* linkage for the thread-group leader pid (PIDTYPE_TGID) isn't
2114-
* possible. Thus, if there's still task linkage for PIDTYPE_PID
2115-
* not having thread-group leader linkage for the pid means it
2116-
* wasn't a thread-group leader in the first place.
2070+
* PIDFD_STALE is only allowed to be passed if the caller knows
2071+
* that @pid is already registered in pidfs and thus
2072+
* PIDFD_INFO_EXIT information is guaranteed to be available.
21172073
*/
2118-
scoped_guard(spinlock_irq, &pid->wait_pidfd.lock) {
2074+
if (!(flags & PIDFD_STALE)) {
2075+
/*
2076+
* While holding the pidfd waitqueue lock removing the
2077+
* task linkage for the thread-group leader pid
2078+
* (PIDTYPE_TGID) isn't possible. Thus, if there's still
2079+
* task linkage for PIDTYPE_PID not having thread-group
2080+
* leader linkage for the pid means it wasn't a
2081+
* thread-group leader in the first place.
2082+
*/
2083+
guard(spinlock_irq)(&pid->wait_pidfd.lock);
2084+
21192085
/* Task has already been reaped. */
21202086
if (!pid_has_task(pid, PIDTYPE_PID))
21212087
return -ESRCH;
@@ -2128,7 +2094,16 @@ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
21282094
return -ENOENT;
21292095
}
21302096

2131-
return __pidfd_prepare(pid, flags, ret);
2097+
CLASS(get_unused_fd, pidfd)(O_CLOEXEC);
2098+
if (pidfd < 0)
2099+
return pidfd;
2100+
2101+
pidfs_file = pidfs_alloc_file(pid, flags | O_RDWR);
2102+
if (IS_ERR(pidfs_file))
2103+
return PTR_ERR(pidfs_file);
2104+
2105+
*ret_file = pidfs_file;
2106+
return take_fd(pidfd);
21322107
}
21332108

21342109
static void __delayed_free_task(struct rcu_head *rhp)
@@ -2477,7 +2452,7 @@ __latent_entropy struct task_struct *copy_process(
24772452
* Note that no task has been attached to @pid yet indicate
24782453
* that via CLONE_PIDFD.
24792454
*/
2480-
retval = __pidfd_prepare(pid, flags | PIDFD_CLONE, &pidfile);
2455+
retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
24812456
if (retval < 0)
24822457
goto bad_fork_free_pid;
24832458
pidfd = retval;

net/core/sock.c

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,8 @@
148148

149149
#include <linux/ethtool.h>
150150

151+
#include <uapi/linux/pidfd.h>
152+
151153
#include "dev.h"
152154

153155
static DEFINE_MUTEX(proto_list_mutex);
@@ -1879,6 +1881,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
18791881
{
18801882
struct pid *peer_pid;
18811883
struct file *pidfd_file = NULL;
1884+
unsigned int flags = 0;
18821885
int pidfd;
18831886

18841887
if (len > sizeof(pidfd))
@@ -1891,18 +1894,17 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
18911894
if (!peer_pid)
18921895
return -ENODATA;
18931896

1894-
pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1897+
/* The use of PIDFD_STALE requires stashing of struct pid
1898+
* on pidfs with pidfs_register_pid() and only AF_UNIX
1899+
* were prepared for this.
1900+
*/
1901+
if (sk->sk_family == AF_UNIX)
1902+
flags = PIDFD_STALE;
1903+
1904+
pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file);
18951905
put_pid(peer_pid);
1896-
if (pidfd < 0) {
1897-
/*
1898-
* dbus-broker relies on -EINVAL being returned
1899-
* to indicate ESRCH. Paper over it until this
1900-
* is fixed in userspace.
1901-
*/
1902-
if (pidfd == -ESRCH)
1903-
pidfd = -EINVAL;
1906+
if (pidfd < 0)
19041907
return pidfd;
1905-
}
19061908

19071909
if (copy_to_sockptr(optval, &pidfd, len) ||
19081910
copy_to_sockptr(optlen, &len, sizeof(int))) {

0 commit comments

Comments
 (0)