
Commit c7bb042

Merge patch series "fs: lockless mntns lookup"

Christian Brauner <[email protected]> says:

Currently we take the read lock when looking for a mount namespace to list
mounts in. We can make this lockless. The simple search case can just use a
sequence counter to detect concurrent changes to the rbtree.

For walking the list of mount namespaces sequentially via nsfs we keep a
separate rcu list as rb_prev() and rb_next() aren't usable safely with rcu.

Since creating mount namespaces is a relatively rare event compared with
querying mounts in a foreign mount namespace this is worth it. Once libmount
and systemd pick up this mechanism to list mounts in foreign mount namespaces
this will be used very frequently.

* patches from https://lore.kernel.org/r/[email protected]:
  samples: add test-list-all-mounts
  selftests: remove unneeded include
  selftests: add tests for mntns iteration
  seltests: move nsfs into filesystems subfolder
  fs: simplify rwlock to spinlock
  fs: lockless mntns lookup for nsfs
  rculist: add list_bidir_{del,prev}_rcu()
  fs: lockless mntns rbtree lookup
  fs: add mount namespace to rbtree late
  mount: remove inlude/nospec.h include

Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Christian Brauner <[email protected]>

2 parents: bd32073 + 75d0dd1, commit: c7bb042
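
The read side described above boils down to an RCU walk wrapped in a sequence-counter retry loop: hits are always trustworthy, only misses need to be re-checked against a concurrent rbtree change. A minimal sketch of that pattern, condensed from the fs/namespace.c hunk further down (same names as in the diff, nothing added beyond comments):

    /* Lockless mnt_ns_id -> mount namespace lookup (condensed sketch). */
    static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
    {
            struct rb_node *node;
            unsigned int seq;

            guard(rcu)();   /* readers never block the writers */
            do {
                    seq = read_seqbegin(&mnt_ns_tree_lock);
                    node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
                    if (node)       /* a hit can't be a false positive */
                            break;
            } while (read_seqretry(&mnt_ns_tree_lock, seq));

            if (!node)
                    return NULL;
            /* The final put is RCU-delayed, so a passive ref is always safe here. */
            refcount_inc(&node_to_mnt_ns(node)->passive);
            return node_to_mnt_ns(node);
    }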

File tree: 14 files changed (+540, -80 lines)

fs/mount.h

Lines changed: 8 additions & 10 deletions
@@ -12,11 +12,15 @@ struct mnt_namespace {
         struct user_namespace *user_ns;
         struct ucounts *ucounts;
         u64 seq; /* Sequence number to prevent loops */
-        wait_queue_head_t poll;
+        union {
+                wait_queue_head_t poll;
+                struct rcu_head mnt_ns_rcu;
+        };
         u64 event;
         unsigned int nr_mounts; /* # of mounts in the namespace */
         unsigned int pending_mounts;
         struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
+        struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */
         refcount_t passive; /* number references not pinning @mounts */
 } __randomize_layout;

@@ -157,15 +161,9 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
 }

 bool has_locked_children(struct mount *mnt, struct dentry *dentry);
-struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mnt_ns, bool previous);
-static inline struct mnt_namespace *lookup_next_mnt_ns(struct mnt_namespace *mntns)
-{
-        return __lookup_next_mnt_ns(mntns, false);
-}
-static inline struct mnt_namespace *lookup_prev_mnt_ns(struct mnt_namespace *mntns)
-{
-        return __lookup_next_mnt_ns(mntns, true);
-}
+struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mnt_ns,
+                                            bool previous);

 static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
 {
         return container_of(ns, struct mnt_namespace, ns);
fs/namespace.c

Lines changed: 98 additions & 62 deletions
@@ -32,7 +32,6 @@
 #include <linux/fs_context.h>
 #include <linux/shmem_fs.h>
 #include <linux/mnt_idmapping.h>
-#include <linux/nospec.h>

 #include "pnode.h"
 #include "internal.h"
@@ -79,8 +78,10 @@ static struct kmem_cache *mnt_cache __ro_after_init;
 static DECLARE_RWSEM(namespace_sem);
 static HLIST_HEAD(unmounted);   /* protected by namespace_sem */
 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
-static DEFINE_RWLOCK(mnt_ns_tree_lock);
+static DEFINE_SEQLOCK(mnt_ns_tree_lock);
+
 static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
+static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */

 struct mount_kattr {
         unsigned int attr_set;
@@ -106,42 +107,60 @@ EXPORT_SYMBOL_GPL(fs_kobj);
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);

-static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
-{
-        u64 seq_b = ns->seq;
-
-        if (seq < seq_b)
-                return -1;
-        if (seq > seq_b)
-                return 1;
-        return 0;
-}
-
 static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
 {
         if (!node)
                 return NULL;
         return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
 }

-static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
+static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
 {
         struct mnt_namespace *ns_a = node_to_mnt_ns(a);
         struct mnt_namespace *ns_b = node_to_mnt_ns(b);
         u64 seq_a = ns_a->seq;
+        u64 seq_b = ns_b->seq;
+
+        if (seq_a < seq_b)
+                return -1;
+        if (seq_a > seq_b)
+                return 1;
+        return 0;
+}
+
+static inline void mnt_ns_tree_write_lock(void)
+{
+        write_seqlock(&mnt_ns_tree_lock);
+}

-        return mnt_ns_cmp(seq_a, ns_b) < 0;
+static inline void mnt_ns_tree_write_unlock(void)
+{
+        write_sequnlock(&mnt_ns_tree_lock);
 }

 static void mnt_ns_tree_add(struct mnt_namespace *ns)
 {
-        guard(write_lock)(&mnt_ns_tree_lock);
-        rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
+        struct rb_node *node, *prev;
+
+        mnt_ns_tree_write_lock();
+        node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
+        /*
+         * If there's no previous entry simply add it after the
+         * head and if there is add it after the previous entry.
+         */
+        prev = rb_prev(&ns->mnt_ns_tree_node);
+        if (!prev)
+                list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list);
+        else
+                list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list);
+        mnt_ns_tree_write_unlock();
+
+        WARN_ON_ONCE(node);
 }

 static void mnt_ns_release(struct mnt_namespace *ns)
 {
-        lockdep_assert_not_held(&mnt_ns_tree_lock);
+        lockdep_assert_not_held(&mnt_ns_tree_lock.lock);

         /* keep alive for {list,stat}mount() */
         if (refcount_dec_and_test(&ns->passive)) {
@@ -151,41 +170,34 @@ static void mnt_ns_release(struct mnt_namespace *ns)
 }
 DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))

+static void mnt_ns_release_rcu(struct rcu_head *rcu)
+{
+        mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu));
+}
+
 static void mnt_ns_tree_remove(struct mnt_namespace *ns)
 {
         /* remove from global mount namespace list */
         if (!is_anon_ns(ns)) {
-                guard(write_lock)(&mnt_ns_tree_lock);
+                mnt_ns_tree_write_lock();
                 rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
+                list_bidir_del_rcu(&ns->mnt_ns_list);
+                mnt_ns_tree_write_unlock();
         }

-        mnt_ns_release(ns);
+        call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
 }

-/*
- * Returns the mount namespace which either has the specified id, or has the
- * next smallest id afer the specified one.
- */
-static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
+static int mnt_ns_find(const void *key, const struct rb_node *node)
 {
-        struct rb_node *node = mnt_ns_tree.rb_node;
-        struct mnt_namespace *ret = NULL;
-
-        lockdep_assert_held(&mnt_ns_tree_lock);
+        const u64 mnt_ns_id = *(u64 *)key;
+        const struct mnt_namespace *ns = node_to_mnt_ns(node);

-        while (node) {
-                struct mnt_namespace *n = node_to_mnt_ns(node);
-
-                if (mnt_ns_id <= n->seq) {
-                        ret = node_to_mnt_ns(node);
-                        if (mnt_ns_id == n->seq)
-                                break;
-                        node = node->rb_left;
-                } else {
-                        node = node->rb_right;
-                }
-        }
-        return ret;
+        if (mnt_ns_id < ns->seq)
+                return -1;
+        if (mnt_ns_id > ns->seq)
+                return 1;
+        return 0;
 }

 /*
@@ -195,18 +207,37 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
  * namespace the @namespace_sem must first be acquired. If the namespace has
  * already shut down before acquiring @namespace_sem, {list,stat}mount() will
  * see that the mount rbtree of the namespace is empty.
+ *
+ * Note the lookup is lockless protected by a sequence counter. We only
+ * need to guard against false negatives as false positives aren't
+ * possible. So if we didn't find a mount namespace and the sequence
+ * counter has changed we need to retry. If the sequence counter is
+ * still the same we know the search actually failed.
  */
 static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
 {
-        struct mnt_namespace *ns;
+        struct mnt_namespace *ns;
+        struct rb_node *node;
+        unsigned int seq;

-        guard(read_lock)(&mnt_ns_tree_lock);
-        ns = mnt_ns_find_id_at(mnt_ns_id);
-        if (!ns || ns->seq != mnt_ns_id)
-                return NULL;
+        guard(rcu)();
+        do {
+                seq = read_seqbegin(&mnt_ns_tree_lock);
+                node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
+                if (node)
+                        break;
+        } while (read_seqretry(&mnt_ns_tree_lock, seq));

-        refcount_inc(&ns->passive);
-        return ns;
+        if (!node)
+                return NULL;
+
+        /*
+         * The last reference count is put with RCU delay so we can
+         * unconditonally acquire a reference here.
+         */
+        ns = node_to_mnt_ns(node);
+        refcount_inc(&ns->passive);
+        return ns;
 }

 static inline void lock_mount_hash(void)
@@ -2063,30 +2094,34 @@ struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
         return &mnt->ns;
 }

-struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mntns, bool previous)
+struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous)
 {
-        guard(read_lock)(&mnt_ns_tree_lock);
+        guard(rcu)();
+
         for (;;) {
-                struct rb_node *node;
+                struct list_head *list;

                 if (previous)
-                        node = rb_prev(&mntns->mnt_ns_tree_node);
+                        list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list));
                 else
-                        node = rb_next(&mntns->mnt_ns_tree_node);
-                if (!node)
+                        list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list));
+                if (list_is_head(list, &mnt_ns_list))
                         return ERR_PTR(-ENOENT);

-                mntns = node_to_mnt_ns(node);
-                node = &mntns->mnt_ns_tree_node;
+                mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list);

+                /*
+                 * The last passive reference count is put with RCU
+                 * delay so accessing the mount namespace is not just
+                 * safe but all relevant members are still valid.
+                 */
                 if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN))
                         continue;

                 /*
-                 * Holding mnt_ns_tree_lock prevents the mount namespace from
-                 * being freed but it may well be on it's deathbed. We want an
-                 * active reference, not just a passive one here as we're
-                 * persisting the mount namespace.
+                 * We need an active reference count as we're persisting
+                 * the mount namespace and it might already be on its
+                 * deathbed.
                  */
                 if (!refcount_inc_not_zero(&mntns->ns.count))
                         continue;
@@ -3903,6 +3938,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
         refcount_set(&new_ns->ns.count, 1);
         refcount_set(&new_ns->passive, 1);
         new_ns->mounts = RB_ROOT;
+        INIT_LIST_HEAD(&new_ns->mnt_ns_list);
         RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
         init_waitqueue_head(&new_ns->poll);
         new_ns->user_ns = get_user_ns(user_ns);
@@ -3982,14 +4018,14 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
                 while (p->mnt.mnt_root != q->mnt.mnt_root)
                         p = next_mnt(skip_mnt_tree(p), old);
         }
-        mnt_ns_tree_add(new_ns);
         namespace_unlock();

         if (rootmnt)
                 mntput(rootmnt);
         if (pwdmnt)
                 mntput(pwdmnt);

+        mnt_ns_tree_add(new_ns);
         return new_ns;
 }

fs/nsfs.c

Lines changed: 1 addition & 4 deletions
@@ -274,10 +274,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
                 if (usize < MNT_NS_INFO_SIZE_VER0)
                         return -EINVAL;

-                if (previous)
-                        mnt_ns = lookup_prev_mnt_ns(to_mnt_ns(ns));
-                else
-                        mnt_ns = lookup_next_mnt_ns(to_mnt_ns(ns));
+                mnt_ns = get_sequential_mnt_ns(to_mnt_ns(ns), previous);
                 if (IS_ERR(mnt_ns))
                         return PTR_ERR(mnt_ns);

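For orientation, userspace drives this path through an nsfs ioctl on a mount namespace fd; the new test-list-all-mounts sample added further down does exactly that. The sketch below is illustrative only: the NS_MNT_GET_NEXT name, the struct mnt_ns_info layout, and the "returns an fd for the next namespace" semantics are assumptions about the nsfs UAPI this series builds on, not something shown in this commit.

    /* Illustrative sketch; ioctl name, struct layout and semantics are assumed. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/nsfs.h>     /* assumed: NS_MNT_GET_NEXT, struct mnt_ns_info */

    int main(void)
    {
            struct mnt_ns_info info = { .size = MNT_NS_INFO_SIZE_VER0 };
            int fd = open("/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC);

            if (fd < 0)
                    return 1;
            for (;;) {
                    /* Assumed to return an fd referring to the next mount namespace. */
                    int next = ioctl(fd, NS_MNT_GET_NEXT, &info);
                    if (next < 0)
                            break;  /* ENOENT once the end of the list is reached */
                    printf("mnt_ns_id=%llu nr_mounts=%u\n",
                           (unsigned long long)info.mnt_ns_id, info.nr_mounts);
                    close(fd);
                    fd = next;
            }
            close(fd);
            return 0;
    }
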
include/linux/rculist.h

Lines changed: 44 additions & 0 deletions
@@ -30,6 +30,17 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
  * way, we must not access it directly
  */
 #define list_next_rcu(list)     (*((struct list_head __rcu **)(&(list)->next)))
+/*
+ * Return the ->prev pointer of a list_head in an rcu safe way. Don't
+ * access it directly.
+ *
+ * Any list traversed with list_bidir_prev_rcu() must never use
+ * list_del_rcu(). Doing so will poison the ->prev pointer that
+ * list_bidir_prev_rcu() relies on, which will result in segfaults.
+ * To prevent these segfaults, use list_bidir_del_rcu() instead
+ * of list_del_rcu().
+ */
+#define list_bidir_prev_rcu(list) (*((struct list_head __rcu **)(&(list)->prev)))

 /**
  * list_tail_rcu - returns the prev pointer of the head of the list
@@ -158,6 +169,39 @@ static inline void list_del_rcu(struct list_head *entry)
         entry->prev = LIST_POISON2;
 }

+/**
+ * list_bidir_del_rcu - deletes entry from list without re-initialization
+ * @entry: the element to delete from the list.
+ *
+ * In contrast to list_del_rcu() doesn't poison the prev pointer thus
+ * allowing backwards traversal via list_bidir_prev_rcu().
+ *
+ * Note: list_empty() on entry does not return true after this because
+ * the entry is in a special undefined state that permits RCU-based
+ * lockfree reverse traversal. In particular this means that we can not
+ * poison the forward and backwards pointers that may still be used for
+ * walking the list.
+ *
+ * The caller must take whatever precautions are necessary (such as
+ * holding appropriate locks) to avoid racing with another list-mutation
+ * primitive, such as list_bidir_del_rcu() or list_add_rcu(), running on
+ * this same list. However, it is perfectly legal to run concurrently
+ * with the _rcu list-traversal primitives, such as
+ * list_for_each_entry_rcu().
+ *
+ * Note that list_del_rcu() and list_bidir_del_rcu() must not be used on
+ * the same list.
+ *
+ * Note that the caller is not permitted to immediately free
+ * the newly deleted entry. Instead, either synchronize_rcu()
+ * or call_rcu() must be used to defer freeing until an RCU
+ * grace period has elapsed.
+ */
+static inline void list_bidir_del_rcu(struct list_head *entry)
+{
+        __list_del_entry(entry);
+}
+
 /**
  * hlist_del_init_rcu - deletes entry from hash list with re-initialization
  * @n: the element to delete from the hash list.

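The rule spelled out above (a list walked with list_bidir_prev_rcu() must be unlinked with list_bidir_del_rcu(), never list_del_rcu()) is easiest to see in a small writer/reader pairing. This is a sketch with hypothetical item and lock names; it mirrors how the mount namespace list in fs/namespace.c uses these helpers:

    /* Sketch with hypothetical names; the writer/reader pairing is the point. */
    #include <linux/rculist.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>
    #include <linux/spinlock.h>

    struct item {
            int value;
            struct list_head node;
            struct rcu_head rcu;
    };

    static LIST_HEAD(items);
    static DEFINE_SPINLOCK(items_lock);     /* serializes writers only */

    static void item_del(struct item *it)
    {
            spin_lock(&items_lock);
            /* Leaves ->prev unpoisoned so readers may still walk backwards. */
            list_bidir_del_rcu(&it->node);
            spin_unlock(&items_lock);
            kfree_rcu(it, rcu);             /* free only after a grace period */
    }

    /* Caller must already hold a reference on @it (or be in the same RCU section). */
    static int prev_value_or(struct item *it, int fallback)
    {
            struct list_head *prev;

            guard(rcu)();
            prev = rcu_dereference(list_bidir_prev_rcu(&it->node));
            if (list_is_head(prev, &items))
                    return fallback;
            return list_entry_rcu(prev, struct item, node)->value;
    }
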
samples/vfs/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
 /test-fsmount
+/test-list-all-mounts
 /test-statx
 /mountinfo

samples/vfs/Makefile

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
-userprogs-always-y += test-fsmount test-statx mountinfo
+userprogs-always-y += test-fsmount test-statx mountinfo test-list-all-mounts

 userccflags += -I usr/include
