Skip to content

Commit 4368898

Browse files
committed
fs: lockless mntns lookup for nsfs
We already made the rbtree lookup lockless for the simple lookup case. However, walking the list of mount namespaces via nsfs still happens while holding the read lock, pointlessly blocking concurrent additions of new mount namespaces. Plus, such additions are rare anyway, so allow lockless lookup of the previous and next mount namespace by keeping a separate list. This also allows to make some things simpler in the code. Link: https://lore.kernel.org/r/[email protected] Reviewed-by: Jeff Layton <[email protected]> Suggested-by: Peter Zijlstra <[email protected]> Signed-off-by: Christian Brauner <[email protected]>
1 parent 67d676b commit 4368898

File tree

3 files changed

+34
-26
lines changed

3 files changed

+34
-26
lines changed

fs/mount.h

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ struct mnt_namespace {
2020
unsigned int nr_mounts; /* # of mounts in the namespace */
2121
unsigned int pending_mounts;
2222
struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
23+
struct list_head mnt_ns_list; /* entry in the sequential list of mount namespaces */
2324
refcount_t passive; /* number references not pinning @mounts */
2425
} __randomize_layout;
2526

@@ -160,15 +161,9 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
160161
}
161162

162163
bool has_locked_children(struct mount *mnt, struct dentry *dentry);
163-
struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mnt_ns, bool previous);
164-
static inline struct mnt_namespace *lookup_next_mnt_ns(struct mnt_namespace *mntns)
165-
{
166-
return __lookup_next_mnt_ns(mntns, false);
167-
}
168-
static inline struct mnt_namespace *lookup_prev_mnt_ns(struct mnt_namespace *mntns)
169-
{
170-
return __lookup_next_mnt_ns(mntns, true);
171-
}
164+
struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mnt_ns,
165+
bool previous);
166+
172167
static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
173168
{
174169
return container_of(ns, struct mnt_namespace, ns);

fs/namespace.c

Lines changed: 29 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ static DEFINE_RWLOCK(mnt_ns_tree_lock);
8282
static seqcount_rwlock_t mnt_ns_tree_seqcount = SEQCNT_RWLOCK_ZERO(mnt_ns_tree_seqcount, &mnt_ns_tree_lock);
8383

8484
static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
85+
static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */
8586

8687
struct mount_kattr {
8788
unsigned int attr_set;
@@ -142,10 +143,19 @@ static inline void mnt_ns_tree_write_unlock(void)
142143

143144
static void mnt_ns_tree_add(struct mnt_namespace *ns)
144145
{
145-
struct rb_node *node;
146+
struct rb_node *node, *prev;
146147

147148
mnt_ns_tree_write_lock();
148149
node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
150+
/*
151+
* If there's no previous entry simply add it after the
152+
* head and if there is add it after the previous entry.
153+
*/
154+
prev = rb_prev(&ns->mnt_ns_tree_node);
155+
if (!prev)
156+
list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list);
157+
else
158+
list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list);
149159
mnt_ns_tree_write_unlock();
150160

151161
WARN_ON_ONCE(node);
@@ -174,6 +184,7 @@ static void mnt_ns_tree_remove(struct mnt_namespace *ns)
174184
if (!is_anon_ns(ns)) {
175185
mnt_ns_tree_write_lock();
176186
rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
187+
list_bidir_del_rcu(&ns->mnt_ns_list);
177188
mnt_ns_tree_write_unlock();
178189
}
179190

@@ -2086,30 +2097,34 @@ struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
20862097
return &mnt->ns;
20872098
}
20882099

2089-
struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mntns, bool previous)
2100+
struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous)
20902101
{
2091-
guard(read_lock)(&mnt_ns_tree_lock);
2102+
guard(rcu)();
2103+
20922104
for (;;) {
2093-
struct rb_node *node;
2105+
struct list_head *list;
20942106

20952107
if (previous)
2096-
node = rb_prev(&mntns->mnt_ns_tree_node);
2108+
list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list));
20972109
else
2098-
node = rb_next(&mntns->mnt_ns_tree_node);
2099-
if (!node)
2110+
list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list));
2111+
if (list_is_head(list, &mnt_ns_list))
21002112
return ERR_PTR(-ENOENT);
21012113

2102-
mntns = node_to_mnt_ns(node);
2103-
node = &mntns->mnt_ns_tree_node;
2114+
mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list);
21042115

2116+
/*
2117+
* The last passive reference count is put with RCU
2118+
* delay so accessing the mount namespace is not just
2119+
* safe but all relevant members are still valid.
2120+
*/
21052121
if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN))
21062122
continue;
21072123

21082124
/*
2109-
* Holding mnt_ns_tree_lock prevents the mount namespace from
2110-
* being freed but it may well be on its deathbed. We want an
2111-
* active reference, not just a passive one here as we're
2112-
* persisting the mount namespace.
2125+
* We need an active reference count as we're persisting
2126+
* the mount namespace and it might already be on its
2127+
* deathbed.
21132128
*/
21142129
if (!refcount_inc_not_zero(&mntns->ns.count))
21152130
continue;
@@ -3926,6 +3941,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
39263941
refcount_set(&new_ns->ns.count, 1);
39273942
refcount_set(&new_ns->passive, 1);
39283943
new_ns->mounts = RB_ROOT;
3944+
INIT_LIST_HEAD(&new_ns->mnt_ns_list);
39293945
RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
39303946
init_waitqueue_head(&new_ns->poll);
39313947
new_ns->user_ns = get_user_ns(user_ns);

fs/nsfs.c

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -274,10 +274,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
274274
if (usize < MNT_NS_INFO_SIZE_VER0)
275275
return -EINVAL;
276276

277-
if (previous)
278-
mnt_ns = lookup_prev_mnt_ns(to_mnt_ns(ns));
279-
else
280-
mnt_ns = lookup_next_mnt_ns(to_mnt_ns(ns));
277+
mnt_ns = get_sequential_mnt_ns(to_mnt_ns(ns), previous);
281278
if (IS_ERR(mnt_ns))
282279
return PTR_ERR(mnt_ns);
283280

0 commit comments

Comments
 (0)