Skip to content

Commit fd0a109

Browse files
committed
net, pidfs: prepare for handing out pidfds for reaped sk->sk_peer_pid
SO_PEERPIDFD currently doesn't support handing out pidfds if the sk->sk_peer_pid thread-group leader has already been reaped. In this case it currently returns EINVAL. Userspace still wants to get a pidfd for a reaped process to have a stable handle it can pass on. This is especially useful now that it is possible to retrieve exit information through a pidfd via the PIDFD_GET_INFO ioctl()'s PIDFD_INFO_EXIT flag. Another summary has been provided by David in [1]: > A pidfd can outlive the task it refers to, and thus user-space must > already be prepared that the task underlying a pidfd is gone at the time > they get their hands on the pidfd. For instance, resolving the pidfd to > a PID via the fdinfo must be prepared to read `-1`. > > Despite user-space knowing that a pidfd might be stale, several kernel > APIs currently add another layer that checks for this. In particular, > SO_PEERPIDFD returns `EINVAL` if the peer-task was already reaped, > but returns a stale pidfd if the task is reaped immediately after the > respective alive-check. > > This has the unfortunate effect that user-space now has two ways to > check for the exact same scenario: A syscall might return > EINVAL/ESRCH/... *or* the pidfd might be stale, even though there is no > particular reason to distinguish both cases. This also propagates > through user-space APIs, which pass on pidfds. They must be prepared to > pass on `-1` *or* the pidfd, because there is no guaranteed way to get a > stale pidfd from the kernel. > Userspace must already deal with a pidfd referring to a reaped task as > the task may exit and get reaped at any time while there are still many > pidfds referring to it. In order to allow handing out reaped pidfds, SO_PEERPIDFD needs to ensure that PIDFD_INFO_EXIT information is available whenever a pidfd for a reaped task is created by SO_PEERPIDFD.
The uapi promises that reaped pidfds are only handed out if it is guaranteed that the caller sees the exit information: TEST_F(pidfd_info, success_reaped) { struct pidfd_info info = { .mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT, }; /* * Process has already been reaped and PIDFD_INFO_EXIT been set. * Verify that we can retrieve the exit status of the process. */ ASSERT_EQ(ioctl(self->child_pidfd4, PIDFD_GET_INFO, &info), 0); ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS)); ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT)); ASSERT_TRUE(WIFEXITED(info.exit_code)); ASSERT_EQ(WEXITSTATUS(info.exit_code), 0); } To hand out pidfds for reaped processes we thus allocate a pidfs entry for the relevant sk->sk_peer_pid at the time the sk->sk_peer_pid is stashed and drop it when the socket is destroyed. This guarantees that exit information will always be recorded for the sk->sk_peer_pid task and we can hand out pidfds for reaped processes. Link: https://lore.kernel.org/lkml/[email protected] [1] Link: https://lore.kernel.org/[email protected] Reviewed-by: David Rheinsberg <[email protected]> Reviewed-by: Kuniyuki Iwashima <[email protected]> Signed-off-by: Christian Brauner <[email protected]>
1 parent 4770584 commit fd0a109

File tree

1 file changed

+74
-11
lines changed

1 file changed

+74
-11
lines changed

net/unix/af_unix.c

Lines changed: 74 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@
100100
#include <linux/splice.h>
101101
#include <linux/string.h>
102102
#include <linux/uaccess.h>
103+
#include <linux/pidfs.h>
103104
#include <net/af_unix.h>
104105
#include <net/net_namespace.h>
105106
#include <net/scm.h>
@@ -643,6 +644,9 @@ static void unix_sock_destructor(struct sock *sk)
643644
return;
644645
}
645646

647+
if (sk->sk_peer_pid)
648+
pidfs_put_pid(sk->sk_peer_pid);
649+
646650
if (u->addr)
647651
unix_release_addr(u->addr);
648652

@@ -734,25 +738,60 @@ static void unix_release_sock(struct sock *sk, int embrion)
734738
unix_gc(); /* Garbage collect fds */
735739
}
736740

737-
static void init_peercred(struct sock *sk)
741+
struct unix_peercred {
742+
struct pid *peer_pid;
743+
const struct cred *peer_cred;
744+
};
745+
746+
static inline int prepare_peercred(struct unix_peercred *peercred)
738747
{
739-
sk->sk_peer_pid = get_pid(task_tgid(current));
740-
sk->sk_peer_cred = get_current_cred();
748+
struct pid *pid;
749+
int err;
750+
751+
pid = task_tgid(current);
752+
err = pidfs_register_pid(pid);
753+
if (likely(!err)) {
754+
peercred->peer_pid = get_pid(pid);
755+
peercred->peer_cred = get_current_cred();
756+
}
757+
return err;
741758
}
742759

743-
static void update_peercred(struct sock *sk)
760+
static void drop_peercred(struct unix_peercred *peercred)
761+
{
762+
const struct cred *cred = NULL;
763+
struct pid *pid = NULL;
764+
765+
might_sleep();
766+
767+
swap(peercred->peer_pid, pid);
768+
swap(peercred->peer_cred, cred);
769+
770+
pidfs_put_pid(pid);
771+
put_pid(pid);
772+
put_cred(cred);
773+
}
774+
775+
static inline void init_peercred(struct sock *sk,
776+
const struct unix_peercred *peercred)
777+
{
778+
sk->sk_peer_pid = peercred->peer_pid;
779+
sk->sk_peer_cred = peercred->peer_cred;
780+
}
781+
782+
static void update_peercred(struct sock *sk, struct unix_peercred *peercred)
744783
{
745784
const struct cred *old_cred;
746785
struct pid *old_pid;
747786

748787
spin_lock(&sk->sk_peer_lock);
749788
old_pid = sk->sk_peer_pid;
750789
old_cred = sk->sk_peer_cred;
751-
init_peercred(sk);
790+
init_peercred(sk, peercred);
752791
spin_unlock(&sk->sk_peer_lock);
753792

754-
put_pid(old_pid);
755-
put_cred(old_cred);
793+
peercred->peer_pid = old_pid;
794+
peercred->peer_cred = old_cred;
756795
}
757796

758797
static void copy_peercred(struct sock *sk, struct sock *peersk)
@@ -761,6 +800,7 @@ static void copy_peercred(struct sock *sk, struct sock *peersk)
761800

762801
spin_lock(&sk->sk_peer_lock);
763802
sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
803+
pidfs_get_pid(sk->sk_peer_pid);
764804
sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
765805
spin_unlock(&sk->sk_peer_lock);
766806
}
@@ -770,13 +810,17 @@ static int unix_listen(struct socket *sock, int backlog)
770810
int err;
771811
struct sock *sk = sock->sk;
772812
struct unix_sock *u = unix_sk(sk);
813+
struct unix_peercred peercred = {};
773814

774815
err = -EOPNOTSUPP;
775816
if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
776817
goto out; /* Only stream/seqpacket sockets accept */
777818
err = -EINVAL;
778819
if (!READ_ONCE(u->addr))
779820
goto out; /* No listens on an unbound socket */
821+
err = prepare_peercred(&peercred);
822+
if (err)
823+
goto out;
780824
unix_state_lock(sk);
781825
if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
782826
goto out_unlock;
@@ -786,11 +830,12 @@ static int unix_listen(struct socket *sock, int backlog)
786830
WRITE_ONCE(sk->sk_state, TCP_LISTEN);
787831

788832
/* set credentials so connect can copy them */
789-
update_peercred(sk);
833+
update_peercred(sk, &peercred);
790834
err = 0;
791835

792836
out_unlock:
793837
unix_state_unlock(sk);
838+
drop_peercred(&peercred);
794839
out:
795840
return err;
796841
}
@@ -1525,6 +1570,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
15251570
struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
15261571
struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
15271572
struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1573+
struct unix_peercred peercred = {};
15281574
struct net *net = sock_net(sk);
15291575
struct sk_buff *skb = NULL;
15301576
unsigned char state;
@@ -1561,6 +1607,10 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
15611607
goto out;
15621608
}
15631609

1610+
err = prepare_peercred(&peercred);
1611+
if (err)
1612+
goto out;
1613+
15641614
/* Allocate skb for sending to listening sock */
15651615
skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
15661616
if (!skb) {
@@ -1636,7 +1686,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
16361686
unix_peer(newsk) = sk;
16371687
newsk->sk_state = TCP_ESTABLISHED;
16381688
newsk->sk_type = sk->sk_type;
1639-
init_peercred(newsk);
1689+
init_peercred(newsk, &peercred);
16401690
newu = unix_sk(newsk);
16411691
newu->listener = other;
16421692
RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
@@ -1695,20 +1745,33 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
16951745
out_free_sk:
16961746
unix_release_sock(newsk, 0);
16971747
out:
1748+
drop_peercred(&peercred);
16981749
return err;
16991750
}
17001751

17011752
static int unix_socketpair(struct socket *socka, struct socket *sockb)
17021753
{
1754+
struct unix_peercred ska_peercred = {}, skb_peercred = {};
17031755
struct sock *ska = socka->sk, *skb = sockb->sk;
1756+
int err;
1757+
1758+
err = prepare_peercred(&ska_peercred);
1759+
if (err)
1760+
return err;
1761+
1762+
err = prepare_peercred(&skb_peercred);
1763+
if (err) {
1764+
drop_peercred(&ska_peercred);
1765+
return err;
1766+
}
17041767

17051768
/* Join our sockets back to back */
17061769
sock_hold(ska);
17071770
sock_hold(skb);
17081771
unix_peer(ska) = skb;
17091772
unix_peer(skb) = ska;
1710-
init_peercred(ska);
1711-
init_peercred(skb);
1773+
init_peercred(ska, &ska_peercred);
1774+
init_peercred(skb, &skb_peercred);
17121775

17131776
ska->sk_state = TCP_ESTABLISHED;
17141777
skb->sk_state = TCP_ESTABLISHED;

0 commit comments

Comments
 (0)