Skip to content

Commit 5dcbd85

Browse files
committed
fs: lockless mntns rbtree lookup
Currently we use a read-write lock, but for the simple search case we can make this lockless. Creating a new mount namespace is a rather rare event compared with querying mounts in a foreign mount namespace. Once this is picked up by e.g., systemd to list mounts in another mount namespace in its isolated services or in containers this will be used a lot, so this seems worthwhile doing. Link: https://lore.kernel.org/r/[email protected] Reviewed-by: Jeff Layton <[email protected]> Signed-off-by: Christian Brauner <[email protected]>
1 parent 144acef commit 5dcbd85

File tree

2 files changed

+74
-47
lines changed

2 files changed

+74
-47
lines changed

fs/mount.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,10 @@ struct mnt_namespace {
1212
struct user_namespace *user_ns;
1313
struct ucounts *ucounts;
1414
u64 seq; /* Sequence number to prevent loops */
15-
wait_queue_head_t poll;
15+
union {
16+
wait_queue_head_t poll;
17+
struct rcu_head mnt_ns_rcu;
18+
};
1619
u64 event;
1720
unsigned int nr_mounts; /* # of mounts in the namespace */
1821
unsigned int pending_mounts;

fs/namespace.c

Lines changed: 70 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,8 @@ static DECLARE_RWSEM(namespace_sem);
7979
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
8080
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
8181
static DEFINE_RWLOCK(mnt_ns_tree_lock);
82+
static seqcount_rwlock_t mnt_ns_tree_seqcount = SEQCNT_RWLOCK_ZERO(mnt_ns_tree_seqcount, &mnt_ns_tree_lock);
83+
8284
static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
8385

8486
struct mount_kattr {
@@ -105,37 +107,48 @@ EXPORT_SYMBOL_GPL(fs_kobj);
105107
*/
106108
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
107109

108-
static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
109-
{
110-
u64 seq_b = ns->seq;
111-
112-
if (seq < seq_b)
113-
return -1;
114-
if (seq > seq_b)
115-
return 1;
116-
return 0;
117-
}
118-
119110
static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
120111
{
121112
if (!node)
122113
return NULL;
123114
return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
124115
}
125116

126-
static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
117+
static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
127118
{
128119
struct mnt_namespace *ns_a = node_to_mnt_ns(a);
129120
struct mnt_namespace *ns_b = node_to_mnt_ns(b);
130121
u64 seq_a = ns_a->seq;
122+
u64 seq_b = ns_b->seq;
131123

132-
return mnt_ns_cmp(seq_a, ns_b) < 0;
124+
if (seq_a < seq_b)
125+
return -1;
126+
if (seq_a > seq_b)
127+
return 1;
128+
return 0;
129+
}
130+
131+
static inline void mnt_ns_tree_write_lock(void)
132+
{
133+
write_lock(&mnt_ns_tree_lock);
134+
write_seqcount_begin(&mnt_ns_tree_seqcount);
135+
}
136+
137+
static inline void mnt_ns_tree_write_unlock(void)
138+
{
139+
write_seqcount_end(&mnt_ns_tree_seqcount);
140+
write_unlock(&mnt_ns_tree_lock);
133141
}
134142

135143
static void mnt_ns_tree_add(struct mnt_namespace *ns)
136144
{
137-
guard(write_lock)(&mnt_ns_tree_lock);
138-
rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
145+
struct rb_node *node;
146+
147+
mnt_ns_tree_write_lock();
148+
node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
149+
mnt_ns_tree_write_unlock();
150+
151+
WARN_ON_ONCE(node);
139152
}
140153

141154
static void mnt_ns_release(struct mnt_namespace *ns)
@@ -150,41 +163,33 @@ static void mnt_ns_release(struct mnt_namespace *ns)
150163
}
151164
DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
152165

166+
static void mnt_ns_release_rcu(struct rcu_head *rcu)
167+
{
168+
mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu));
169+
}
170+
153171
static void mnt_ns_tree_remove(struct mnt_namespace *ns)
154172
{
155173
/* remove from global mount namespace list */
156174
if (!is_anon_ns(ns)) {
157-
guard(write_lock)(&mnt_ns_tree_lock);
175+
mnt_ns_tree_write_lock();
158176
rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
177+
mnt_ns_tree_write_unlock();
159178
}
160179

161-
mnt_ns_release(ns);
180+
call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
162181
}
163182

164-
/*
165-
* Returns the mount namespace which either has the specified id, or has the
166-
* next smallest id after the specified one.
167-
*/
168-
static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
183+
static int mnt_ns_find(const void *key, const struct rb_node *node)
169184
{
170-
struct rb_node *node = mnt_ns_tree.rb_node;
171-
struct mnt_namespace *ret = NULL;
185+
const u64 mnt_ns_id = *(u64 *)key;
186+
const struct mnt_namespace *ns = node_to_mnt_ns(node);
172187

173-
lockdep_assert_held(&mnt_ns_tree_lock);
174-
175-
while (node) {
176-
struct mnt_namespace *n = node_to_mnt_ns(node);
177-
178-
if (mnt_ns_id <= n->seq) {
179-
ret = node_to_mnt_ns(node);
180-
if (mnt_ns_id == n->seq)
181-
break;
182-
node = node->rb_left;
183-
} else {
184-
node = node->rb_right;
185-
}
186-
}
187-
return ret;
188+
if (mnt_ns_id < ns->seq)
189+
return -1;
190+
if (mnt_ns_id > ns->seq)
191+
return 1;
192+
return 0;
188193
}
189194

190195
/*
@@ -194,18 +199,37 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
194199
* namespace the @namespace_sem must first be acquired. If the namespace has
195200
* already shut down before acquiring @namespace_sem, {list,stat}mount() will
196201
* see that the mount rbtree of the namespace is empty.
202+
*
203+
* Note the lookup is lockless and protected by a sequence counter. We only
204+
* need to guard against false negatives as false positives aren't
205+
* possible. So if we didn't find a mount namespace and the sequence
206+
* counter has changed we need to retry. If the sequence counter is
207+
* still the same we know the search actually failed.
197208
*/
198209
static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
199210
{
200-
struct mnt_namespace *ns;
211+
struct mnt_namespace *ns;
212+
struct rb_node *node;
213+
unsigned int seq;
214+
215+
guard(rcu)();
216+
do {
217+
seq = read_seqcount_begin(&mnt_ns_tree_seqcount);
218+
node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
219+
if (node)
220+
break;
221+
} while (read_seqcount_retry(&mnt_ns_tree_seqcount, seq));
201222

202-
guard(read_lock)(&mnt_ns_tree_lock);
203-
ns = mnt_ns_find_id_at(mnt_ns_id);
204-
if (!ns || ns->seq != mnt_ns_id)
205-
return NULL;
223+
if (!node)
224+
return NULL;
206225

207-
refcount_inc(&ns->passive);
208-
return ns;
226+
/*
227+
* The last reference count is put with RCU delay so we can
228+
* unconditionally acquire a reference here.
229+
*/
230+
ns = node_to_mnt_ns(node);
231+
refcount_inc(&ns->passive);
232+
return ns;
209233
}
210234

211235
static inline void lock_mount_hash(void)

0 commit comments

Comments
 (0)