Skip to content

Commit 16ecd47

Browse files
committed
pidfs: lookup pid through rbtree
The new pid inode number allocation scheme is neat but I overlooked a possible, even though unlikely, attack that can be used to trigger an overflow on both 32bit and 64bit. An unique 64 bit identifier was constructed for each struct pid by two combining a 32 bit idr with a 32 bit generation number. A 32bit number was allocated using the idr_alloc_cyclic() infrastructure. When the idr wrapped around a 32 bit wraparound counter was incremented. The 32 bit wraparound counter served as the upper 32 bits and the allocated idr number as the lower 32 bits. Since the idr can only allocate up to INT_MAX entries everytime a wraparound happens INT_MAX - 1 entries are lost (Ignoring that numbering always starts at 2 to avoid theoretical collisions with the root inode number.). If userspace fully populates the idr such that and puts itself into control of two entries such that one entry is somewhere in the middle and the other entry is the INT_MAX entry then it is possible to overflow the wraparound counter. That is probably difficult to pull off but the mere possibility is annoying. The problem could be contained to 32 bit by switching to a data structure such as the maple tree that allows allocating 64 bit numbers on 64 bit machines. That would leave 32 bit in a lurch but that probably doesn't matter that much. The other problem is that removing entries form the maple tree is somewhat non-trivial because the removal code can be called under the irq write lock of tasklist_lock and irq{save,restore} code. Instead, allocate unique identifiers for struct pid by simply incrementing a 64 bit counter and insert each struct pid into the rbtree so it can be looked up to decode file handles avoiding to leak actual pids across pid namespaces in file handles. On both 64 bit and 32 bit the same 64 bit identifier is used to lookup struct pid in the rbtree. On 64 bit the unique identifier for struct pid simply becomes the inode number. Comparing two pidfds continues to be as simple as comparing inode numbers. On 32 bit the 64 bit number assigned to struct pid is split into two 32 bit numbers. The lower 32 bits are used as the inode number and the upper 32 bits are used as the inode generation number. Whenever a wraparound happens on 32 bit the 64 bit number will be incremented by 2 so inode numbering starts at 2 again. When a wraparound happens on 32 bit multiple pidfds with the same inode number are likely to exist. This isn't a problem since before pidfs pidfds used the anonymous inode meaning all pidfds had the same inode number. On 32 bit sserspace can thus reconstruct the 64 bit identifier by retrieving both the inode number and the inode generation number to compare, or use file handles. This gives the same guarantees on both 32 bit and 64 bit. Link: https://lore.kernel.org/r/20241214-gekoppelt-erdarbeiten-a1f9a982a5a6@brauner Signed-off-by: Christian Brauner <[email protected]>
1 parent 59a42b0 commit 16ecd47

File tree

4 files changed

+86
-53
lines changed

4 files changed

+86
-53
lines changed

fs/pidfs.c

Lines changed: 80 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -24,18 +24,9 @@
2424
#include "internal.h"
2525
#include "mount.h"
2626

27-
static DEFINE_IDR(pidfs_ino_idr);
28-
29-
static u32 pidfs_ino_upper_32_bits = 0;
27+
static struct rb_root pidfs_ino_tree = RB_ROOT;
3028

3129
#if BITS_PER_LONG == 32
32-
/*
33-
* On 32 bit systems the lower 32 bits are the inode number and
34-
* the higher 32 bits are the generation number. The starting
35-
* value for the inode number and the generation number is one.
36-
*/
37-
static u32 pidfs_ino_lower_32_bits = 1;
38-
3930
static inline unsigned long pidfs_ino(u64 ino)
4031
{
4132
return lower_32_bits(ino);
@@ -49,52 +40,79 @@ static inline u32 pidfs_gen(u64 ino)
4940

5041
#else
5142

52-
static u32 pidfs_ino_lower_32_bits = 0;
53-
5443
/* On 64 bit simply return ino. */
5544
static inline unsigned long pidfs_ino(u64 ino)
5645
{
5746
return ino;
5847
}
5948

60-
/* On 64 bit the generation number is 1. */
49+
/* On 64 bit the generation number is 0. */
6150
static inline u32 pidfs_gen(u64 ino)
6251
{
63-
return 1;
52+
return 0;
6453
}
6554
#endif
6655

67-
/*
68-
* Construct an inode number for struct pid in a way that we can use the
69-
* lower 32bit to lookup struct pid independent of any pid numbers that
70-
* could be leaked into userspace (e.g., via file handle encoding).
71-
*/
72-
int pidfs_add_pid(struct pid *pid)
56+
static int pidfs_ino_cmp(struct rb_node *a, const struct rb_node *b)
7357
{
74-
u32 upper;
75-
int lower;
76-
77-
/*
78-
* Inode numbering for pidfs start at 2. This avoids collisions
79-
* with the root inode which is 1 for pseudo filesystems.
80-
*/
81-
lower = idr_alloc_cyclic(&pidfs_ino_idr, pid, 2, 0, GFP_ATOMIC);
82-
if (lower >= 0 && lower < pidfs_ino_lower_32_bits)
83-
pidfs_ino_upper_32_bits++;
84-
upper = pidfs_ino_upper_32_bits;
85-
pidfs_ino_lower_32_bits = lower;
86-
if (lower < 0)
87-
return lower;
88-
89-
pid->ino = ((u64)upper << 32) | lower;
90-
pid->stashed = NULL;
58+
struct pid *pid_a = rb_entry(a, struct pid, pidfs_node);
59+
struct pid *pid_b = rb_entry(b, struct pid, pidfs_node);
60+
u64 pid_ino_a = pid_a->ino;
61+
u64 pid_ino_b = pid_b->ino;
62+
63+
if (pid_ino_a < pid_ino_b)
64+
return -1;
65+
if (pid_ino_a > pid_ino_b)
66+
return 1;
9167
return 0;
9268
}
9369

94-
/* The idr number to remove is the lower 32 bits of the inode. */
70+
void pidfs_add_pid(struct pid *pid)
71+
{
72+
static u64 pidfs_ino_nr = 2;
73+
74+
/*
75+
* On 64 bit nothing special happens. The 64bit number assigned
76+
* to struct pid is the inode number.
77+
*
78+
* On 32 bit the 64 bit number assigned to struct pid is split
79+
* into two 32 bit numbers. The lower 32 bits are used as the
80+
* inode number and the upper 32 bits are used as the inode
81+
* generation number.
82+
*
83+
* On 32 bit pidfs_ino() will return the lower 32 bit. When
84+
* pidfs_ino() returns zero a wrap around happened. When a
85+
* wraparound happens the 64 bit number will be incremented by 2
86+
* so inode numbering starts at 2 again.
87+
*
88+
* On 64 bit comparing two pidfds is as simple as comparing
89+
* inode numbers.
90+
*
91+
* When a wraparound happens on 32 bit multiple pidfds with the
92+
* same inode number are likely to exist (This isn't a problem
93+
* since before pidfs pidfds used the anonymous inode meaning
94+
* all pidfds had the same inode number.). Userspace can
95+
* reconstruct the 64 bit identifier by retrieving both the
96+
* inode number and the inode generation number to compare or
97+
* use file handles.
98+
*/
99+
if (pidfs_ino(pidfs_ino_nr) == 0)
100+
pidfs_ino_nr += 2;
101+
102+
pid->ino = pidfs_ino_nr;
103+
pid->stashed = NULL;
104+
pidfs_ino_nr++;
105+
106+
write_seqcount_begin(&pidmap_lock_seq);
107+
rb_find_add_rcu(&pid->pidfs_node, &pidfs_ino_tree, pidfs_ino_cmp);
108+
write_seqcount_end(&pidmap_lock_seq);
109+
}
110+
95111
void pidfs_remove_pid(struct pid *pid)
96112
{
97-
idr_remove(&pidfs_ino_idr, lower_32_bits(pid->ino));
113+
write_seqcount_begin(&pidmap_lock_seq);
114+
rb_erase(&pid->pidfs_node, &pidfs_ino_tree);
115+
write_seqcount_end(&pidmap_lock_seq);
98116
}
99117

100118
#ifdef CONFIG_PROC_FS
@@ -513,24 +531,37 @@ static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
513531
return FILEID_KERNFS;
514532
}
515533

534+
static int pidfs_ino_find(const void *key, const struct rb_node *node)
535+
{
536+
const u64 pid_ino = *(u64 *)key;
537+
const struct pid *pid = rb_entry(node, struct pid, pidfs_node);
538+
539+
if (pid_ino < pid->ino)
540+
return -1;
541+
if (pid_ino > pid->ino)
542+
return 1;
543+
return 0;
544+
}
545+
516546
/* Find a struct pid based on the inode number. */
517547
static struct pid *pidfs_ino_get_pid(u64 ino)
518548
{
519-
unsigned long pid_ino = pidfs_ino(ino);
520-
u32 gen = pidfs_gen(ino);
521549
struct pid *pid;
550+
struct rb_node *node;
551+
unsigned int seq;
522552

523553
guard(rcu)();
524-
525-
pid = idr_find(&pidfs_ino_idr, lower_32_bits(pid_ino));
526-
if (!pid)
554+
do {
555+
seq = read_seqcount_begin(&pidmap_lock_seq);
556+
node = rb_find_rcu(&ino, &pidfs_ino_tree, pidfs_ino_find);
557+
if (node)
558+
break;
559+
} while (read_seqcount_retry(&pidmap_lock_seq, seq));
560+
561+
if (!node)
527562
return NULL;
528563

529-
if (pidfs_ino(pid->ino) != pid_ino)
530-
return NULL;
531-
532-
if (pidfs_gen(pid->ino) != gen)
533-
return NULL;
564+
pid = rb_entry(node, struct pid, pidfs_node);
534565

535566
/* Within our pid namespace hierarchy? */
536567
if (pid_vnr(pid) == 0)

include/linux/pid.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ struct pid
5959
spinlock_t lock;
6060
struct dentry *stashed;
6161
u64 ino;
62+
struct rb_node pidfs_node;
6263
/* lists of tasks that use this pid */
6364
struct hlist_head tasks[PIDTYPE_MAX];
6465
struct hlist_head inodes;
@@ -68,6 +69,7 @@ struct pid
6869
struct upid numbers[];
6970
};
7071

72+
extern seqcount_spinlock_t pidmap_lock_seq;
7173
extern struct pid init_struct_pid;
7274

7375
struct file;

include/linux/pidfs.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
66
void __init pidfs_init(void);
7-
int pidfs_add_pid(struct pid *pid);
7+
void pidfs_add_pid(struct pid *pid);
88
void pidfs_remove_pid(struct pid *pid);
99

1010
#endif /* _LINUX_PID_FS_H */

kernel/pid.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
#include <linux/sched/task.h>
4444
#include <linux/idr.h>
4545
#include <linux/pidfs.h>
46+
#include <linux/seqlock.h>
4647
#include <net/sock.h>
4748
#include <uapi/linux/pidfd.h>
4849

@@ -103,6 +104,7 @@ EXPORT_SYMBOL_GPL(init_pid_ns);
103104
*/
104105

105106
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
107+
seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
106108

107109
void put_pid(struct pid *pid)
108110
{
@@ -273,9 +275,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
273275
spin_lock_irq(&pidmap_lock);
274276
if (!(ns->pid_allocated & PIDNS_ADDING))
275277
goto out_unlock;
276-
retval = pidfs_add_pid(pid);
277-
if (retval)
278-
goto out_unlock;
278+
pidfs_add_pid(pid);
279279
for ( ; upid >= pid->numbers; --upid) {
280280
/* Make the PID visible to find_pid_ns. */
281281
idr_replace(&upid->ns->idr, pid, upid->nr);

0 commit comments

Comments
 (0)