@@ -32,7 +32,6 @@
 #include <linux/fs_context.h>
 #include <linux/shmem_fs.h>
 #include <linux/mnt_idmapping.h>
-#include <linux/nospec.h>

 #include "pnode.h"
 #include "internal.h"
@@ -79,8 +78,10 @@ static struct kmem_cache *mnt_cache __ro_after_init;
 static DECLARE_RWSEM(namespace_sem);
 static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
-static DEFINE_RWLOCK(mnt_ns_tree_lock);
+static DEFINE_SEQLOCK(mnt_ns_tree_lock);
+
 static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
+static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */

 struct mount_kattr {
 	unsigned int attr_set;
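The first functional change swaps the plain rwlock for a seqlock. Writers still serialize on the spinlock embedded in the seqlock, but readers no longer take the lock at all: they sample the sequence count, do their work, and retry if a writer raced with them. A minimal sketch of that general pattern, using hypothetical demo_* names rather than anything from this patch:

#include <linux/seqlock.h>
#include <linux/types.h>

static DEFINE_SEQLOCK(demo_lock);	/* hypothetical lock, for illustration */
static u64 demo_value;			/* hypothetical data it protects */

/*
 * Writer side: write_seqlock() takes the embedded spinlock and bumps
 * the sequence count so concurrent readers notice the update.
 */
static void demo_store(u64 v)
{
	write_seqlock(&demo_lock);
	demo_value = v;
	write_sequnlock(&demo_lock);
}

/*
 * Reader side: no lock is taken; if the sequence count changed while
 * we read, the snapshot may be torn and we simply retry.
 */
static u64 demo_load(void)
{
	unsigned int seq;
	u64 v;

	do {
		seq = read_seqbegin(&demo_lock);
		v = demo_value;
	} while (read_seqretry(&demo_lock, seq));
	return v;
}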
@@ -106,42 +107,60 @@ EXPORT_SYMBOL_GPL(fs_kobj);
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);

-static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
-{
-	u64 seq_b = ns->seq;
-
-	if (seq < seq_b)
-		return -1;
-	if (seq > seq_b)
-		return 1;
-	return 0;
-}
-
 static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
 {
 	if (!node)
 		return NULL;
 	return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
 }

-static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
+static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
 {
 	struct mnt_namespace *ns_a = node_to_mnt_ns(a);
 	struct mnt_namespace *ns_b = node_to_mnt_ns(b);
 	u64 seq_a = ns_a->seq;
+	u64 seq_b = ns_b->seq;
+
+	if (seq_a < seq_b)
+		return -1;
+	if (seq_a > seq_b)
+		return 1;
+	return 0;
+}
+
+static inline void mnt_ns_tree_write_lock(void)
+{
+	write_seqlock(&mnt_ns_tree_lock);
+}

-	return mnt_ns_cmp(seq_a, ns_b) < 0;
+static inline void mnt_ns_tree_write_unlock(void)
+{
+	write_sequnlock(&mnt_ns_tree_lock);
 }

 static void mnt_ns_tree_add(struct mnt_namespace *ns)
 {
-	guard(write_lock)(&mnt_ns_tree_lock);
-	rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
+	struct rb_node *node, *prev;
+
+	mnt_ns_tree_write_lock();
+	node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
+	/*
+	 * If there's no previous entry simply add it after the
+	 * head and if there is add it after the previous entry.
+	 */
+	prev = rb_prev(&ns->mnt_ns_tree_node);
+	if (!prev)
+		list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list);
+	else
+		list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list);
+	mnt_ns_tree_write_unlock();
+
+	WARN_ON_ONCE(node);
 }

 static void mnt_ns_release(struct mnt_namespace *ns)
 {
-	lockdep_assert_not_held(&mnt_ns_tree_lock);
+	lockdep_assert_not_held(&mnt_ns_tree_lock.lock);

 	/* keep alive for {list,stat}mount() */
 	if (refcount_dec_and_test(&ns->passive)) {
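Here rb_add() becomes rb_find_add_rcu(), which both publishes the node in an RCU-safe way and reports key collisions: it returns the already-present node on a duplicate key and NULL on a successful insert, which is what WARN_ON_ONCE(node) checks (mount namespace sequence numbers are unique, so a collision would be a bug). The new mnt_ns_list is kept in the same order as the rbtree so that list-based prev/next walks stay equivalent to rb_prev()/rb_next(). A hedged sketch of that combined pattern, with a hypothetical struct item instead of the actual mount code:

#include <linux/rbtree.h>
#include <linux/rculist.h>
#include <linux/types.h>
#include <linux/bug.h>

struct item {
	u64 id;				/* unique key */
	struct rb_node node;
	struct list_head list;
};

static struct rb_root item_tree = RB_ROOT;
static LIST_HEAD(item_list);

static int item_cmp(struct rb_node *a, const struct rb_node *b)
{
	u64 id_a = rb_entry(a, struct item, node)->id;
	u64 id_b = rb_entry(b, struct item, node)->id;

	if (id_a < id_b)
		return -1;
	if (id_a > id_b)
		return 1;
	return 0;
}

/* Caller is assumed to hold the writer-side lock. */
static void item_add(struct item *it)
{
	struct rb_node *dup, *prev;

	/* Returns the colliding node on a duplicate key, NULL on insert. */
	dup = rb_find_add_rcu(&it->node, &item_tree, item_cmp);
	WARN_ON_ONCE(dup);	/* ids are unique, so a duplicate is a bug */

	/*
	 * Mirror the tree order in the list so that list prev/next
	 * walks match rb_prev()/rb_next() on the tree.
	 */
	prev = rb_prev(&it->node);
	if (!prev)
		list_add_rcu(&it->list, &item_list);
	else
		list_add_rcu(&it->list, &rb_entry(prev, struct item, node)->list);
}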
@@ -151,41 +170,34 @@ static void mnt_ns_release(struct mnt_namespace *ns)
 }
 DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))

+static void mnt_ns_release_rcu(struct rcu_head *rcu)
+{
+	mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu));
+}
+
 static void mnt_ns_tree_remove(struct mnt_namespace *ns)
 {
 	/* remove from global mount namespace list */
 	if (!is_anon_ns(ns)) {
-		guard(write_lock)(&mnt_ns_tree_lock);
+		mnt_ns_tree_write_lock();
 		rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
+		list_bidir_del_rcu(&ns->mnt_ns_list);
+		mnt_ns_tree_write_unlock();
 	}

-	mnt_ns_release(ns);
+	call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
 }

-/*
- * Returns the mount namespace which either has the specified id, or has the
- * next smallest id afer the specified one.
- */
-static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
+static int mnt_ns_find(const void *key, const struct rb_node *node)
 {
-	struct rb_node *node = mnt_ns_tree.rb_node;
-	struct mnt_namespace *ret = NULL;
-
-	lockdep_assert_held(&mnt_ns_tree_lock);
+	const u64 mnt_ns_id = *(u64 *)key;
+	const struct mnt_namespace *ns = node_to_mnt_ns(node);

-	while (node) {
-		struct mnt_namespace *n = node_to_mnt_ns(node);
-
-		if (mnt_ns_id <= n->seq) {
-			ret = node_to_mnt_ns(node);
-			if (mnt_ns_id == n->seq)
-				break;
-			node = node->rb_left;
-		} else {
-			node = node->rb_right;
-		}
-	}
-	return ret;
+	if (mnt_ns_id < ns->seq)
+		return -1;
+	if (mnt_ns_id > ns->seq)
+		return 1;
+	return 0;
 }

 /*
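mnt_ns_tree_remove() now drops the final passive reference through call_rcu() instead of synchronously. That delay is what later lets the lockless lookup take a reference unconditionally: any reader that found the namespace inside an RCU read-side critical section is guaranteed the object outlives that section. A minimal sketch of the deferred-release pattern, with a hypothetical struct obj standing in for struct mnt_namespace and its new mnt_ns_rcu member:

#include <linux/rcupdate.h>
#include <linux/slab.h>

/* Hypothetical object with an embedded rcu_head, for illustration. */
struct obj {
	struct rcu_head rcu;
	/* ... payload ... */
};

static void obj_free_rcu(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct obj, rcu));
}

static void obj_unpublish(struct obj *o)
{
	/*
	 * First unlink o from all RCU-visible structures, then defer
	 * the release until every ongoing RCU read-side critical
	 * section that might still see o has finished.
	 */
	call_rcu(&o->rcu, obj_free_rcu);
}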
@@ -195,18 +207,37 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
  * namespace the @namespace_sem must first be acquired. If the namespace has
  * already shut down before acquiring @namespace_sem, {list,stat}mount() will
  * see that the mount rbtree of the namespace is empty.
+ *
+ * Note the lookup is lockless, protected by a sequence counter. We only
+ * need to guard against false negatives as false positives aren't
+ * possible. So if we didn't find a mount namespace and the sequence
+ * counter has changed we need to retry. If the sequence counter is
+ * still the same we know the search actually failed.
  */
 static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
 {
-	struct mnt_namespace *ns;
+	struct mnt_namespace *ns;
+	struct rb_node *node;
+	unsigned int seq;

-	guard(read_lock)(&mnt_ns_tree_lock);
-	ns = mnt_ns_find_id_at(mnt_ns_id);
-	if (!ns || ns->seq != mnt_ns_id)
-		return NULL;
+	guard(rcu)();
+	do {
+		seq = read_seqbegin(&mnt_ns_tree_lock);
+		node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
+		if (node)
+			break;
+	} while (read_seqretry(&mnt_ns_tree_lock, seq));

-	refcount_inc(&ns->passive);
-	return ns;
+	if (!node)
+		return NULL;
+
+	/*
+	 * The last reference count is put with RCU delay so we can
+	 * unconditionally acquire a reference here.
+	 */
+	ns = node_to_mnt_ns(node);
+	refcount_inc(&ns->passive);
+	return ns;
 }

 static inline void lock_mount_hash(void)
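For context, a hedged sketch of how a caller inside fs/namespace.c might use this (the function name below is illustrative; {list,stat}mount() are the real consumers per the comment above). lookup_mnt_ns() hands back only a passive reference, which keeps the structure alive for id-based queries but does not keep the namespace usable:

/* Illustrative caller (hypothetical), as if in fs/namespace.c. */
static int inspect_ns(u64 mnt_ns_id)
{
	struct mnt_namespace *ns;

	ns = lookup_mnt_ns(mnt_ns_id);	/* acquires ns->passive */
	if (!ns)
		return -ENOENT;

	/*
	 * ... id-based inspection only; the namespace may already be
	 * dead, so persisting it would additionally require an active
	 * reference via refcount_inc_not_zero(&ns->ns.count) ...
	 */

	mnt_ns_release(ns);		/* drops ns->passive */
	return 0;
}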
@@ -2063,30 +2094,34 @@ struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
 	return &mnt->ns;
 }

-struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mntns, bool previous)
+struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous)
 {
-	guard(read_lock)(&mnt_ns_tree_lock);
+	guard(rcu)();
+
 	for (;;) {
-		struct rb_node *node;
+		struct list_head *list;

 		if (previous)
-			node = rb_prev(&mntns->mnt_ns_tree_node);
+			list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list));
 		else
-			node = rb_next(&mntns->mnt_ns_tree_node);
-		if (!node)
+			list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list));
+		if (list_is_head(list, &mnt_ns_list))
 			return ERR_PTR(-ENOENT);

-		mntns = node_to_mnt_ns(node);
-		node = &mntns->mnt_ns_tree_node;
+		mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list);

+		/*
+		 * The last passive reference count is put with RCU
+		 * delay so accessing the mount namespace is not just
+		 * safe but all relevant members are still valid.
+		 */
 		if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN))
 			continue;

 		/*
-		 * Holding mnt_ns_tree_lock prevents the mount namespace from
-		 * being freed but it may well be on it's deathbed. We want an
-		 * active reference, not just a passive one here as we're
-		 * persisting the mount namespace.
+		 * We need an active reference count as we're persisting
+		 * the mount namespace and it might already be on its
+		 * deathbed.
 		 */
 		if (!refcount_inc_not_zero(&mntns->ns.count))
 			continue;
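The renamed get_sequential_mnt_ns() now walks the RCU-protected mnt_ns_list rather than the rbtree, skips namespaces the caller lacks CAP_SYS_ADMIN over, and returns with an active reference. A hedged usage sketch (hypothetical caller, again as if inside fs/namespace.c); each returned namespace must be balanced with put_mnt_ns():

/* Illustrative walk (hypothetical): step forward from @start through
 * the mount namespaces the caller is privileged over. */
static void walk_forward(struct mnt_namespace *start)
{
	struct mnt_namespace *cur, *next;

	cur = get_sequential_mnt_ns(start, false);
	while (!IS_ERR(cur)) {
		/* ... use cur; we own an active reference to it ... */
		next = get_sequential_mnt_ns(cur, false);	/* -ENOENT at list end */
		put_mnt_ns(cur);	/* drop our active reference */
		cur = next;
	}
}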
@@ -3903,6 +3938,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
 	refcount_set(&new_ns->ns.count, 1);
 	refcount_set(&new_ns->passive, 1);
 	new_ns->mounts = RB_ROOT;
+	INIT_LIST_HEAD(&new_ns->mnt_ns_list);
 	RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
 	init_waitqueue_head(&new_ns->poll);
 	new_ns->user_ns = get_user_ns(user_ns);
@@ -3982,14 +4018,14 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 		while (p->mnt.mnt_root != q->mnt.mnt_root)
 			p = next_mnt(skip_mnt_tree(p), old);
 	}
-	mnt_ns_tree_add(new_ns);
 	namespace_unlock();

 	if (rootmnt)
 		mntput(rootmnt);
 	if (pwdmnt)
 		mntput(pwdmnt);

+	mnt_ns_tree_add(new_ns);
 	return new_ns;
 }

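Finally, mnt_ns_tree_add() moves from before namespace_unlock() to the very end of copy_mnt_ns(). With lockless lookup this ordering matters: the namespace must not become findable in the tree and list until it is fully set up. This is the standard RCU publish rule; presumably the RCU-aware rbtree and list insertion helpers supply the release semantics that make the published object's fields visible to readers. A generic sketch with hypothetical names:

#include <linux/rcupdate.h>

/* Hypothetical object and global pointer, for illustration only. */
struct foo {
	int a, b;
};

static struct foo __rcu *global_foo;

static void publish_foo(struct foo *f)
{
	f->a = 1;	/* complete every field first ... */
	f->b = 2;
	rcu_assign_pointer(global_foo, f);	/* ... then publish last */
}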