@@ -79,6 +79,8 @@ static DECLARE_RWSEM(namespace_sem);
 static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
 static DEFINE_RWLOCK(mnt_ns_tree_lock);
+static seqcount_rwlock_t mnt_ns_tree_seqcount = SEQCNT_RWLOCK_ZERO(mnt_ns_tree_seqcount, &mnt_ns_tree_lock);
+
 static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
 
 struct mount_kattr {
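The added seqcount_rwlock_t ties a sequence counter to the existing mnt_ns_tree_lock so that readers can later walk the rbtree without taking the rwlock at all. A minimal sketch of that general writer/reader pairing, using invented names (example_lock, example_seq, example_val) rather than the symbols from this patch:

static DEFINE_RWLOCK(example_lock);
static seqcount_rwlock_t example_seq = SEQCNT_RWLOCK_ZERO(example_seq, &example_lock);
static u64 example_val;

static void example_write(u64 val)
{
	write_lock(&example_lock);		/* serialize writers */
	write_seqcount_begin(&example_seq);	/* bump the counter: readers retry */
	example_val = val;
	write_seqcount_end(&example_seq);
	write_unlock(&example_lock);
}

static u64 example_read(void)
{
	unsigned int seq;
	u64 val;

	do {
		seq = read_seqcount_begin(&example_seq);
		val = example_val;
	} while (read_seqcount_retry(&example_seq, seq));

	return val;
}

Unlike this generic reader, the lookup added later in this patch only retries when the search failed, since a hit can never be stale thanks to RCU-delayed freeing.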
@@ -105,37 +107,48 @@ EXPORT_SYMBOL_GPL(fs_kobj);
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
 
-static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
-{
-	u64 seq_b = ns->seq;
-
-	if (seq < seq_b)
-		return -1;
-	if (seq > seq_b)
-		return 1;
-	return 0;
-}
-
 static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
 {
 	if (!node)
 		return NULL;
 	return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
 }
 
-static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
+static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
 {
 	struct mnt_namespace *ns_a = node_to_mnt_ns(a);
 	struct mnt_namespace *ns_b = node_to_mnt_ns(b);
 	u64 seq_a = ns_a->seq;
+	u64 seq_b = ns_b->seq;
 
-	return mnt_ns_cmp(seq_a, ns_b) < 0;
+	if (seq_a < seq_b)
+		return -1;
+	if (seq_a > seq_b)
+		return 1;
+	return 0;
+}
+
+static inline void mnt_ns_tree_write_lock(void)
+{
+	write_lock(&mnt_ns_tree_lock);
+	write_seqcount_begin(&mnt_ns_tree_seqcount);
+}
+
+static inline void mnt_ns_tree_write_unlock(void)
+{
+	write_seqcount_end(&mnt_ns_tree_seqcount);
+	write_unlock(&mnt_ns_tree_lock);
 }
 
 static void mnt_ns_tree_add(struct mnt_namespace *ns)
 {
-	guard(write_lock)(&mnt_ns_tree_lock);
-	rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
+	struct rb_node *node;
+
+	mnt_ns_tree_write_lock();
+	node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
+	mnt_ns_tree_write_unlock();
+
+	WARN_ON_ONCE(node);
 }
 
 static void mnt_ns_release(struct mnt_namespace *ns)
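rb_find_add_rcu() walks the tree with the supplied comparator, publishes the new node with RCU-safe pointer assignment, and returns the already-present node if the key exists, which is why mnt_ns_tree_add() can WARN_ON_ONCE() on a duplicate namespace id. A rough sketch of that comparator convention for a hypothetical struct keyed by a u64 id (names invented, not taken from the patch):

struct item {
	struct rb_node node;
	u64 id;
};

/* cmp(new, existing): <0 go left, >0 go right, 0 means duplicate */
static int item_cmp(struct rb_node *a, const struct rb_node *b)
{
	const struct item *ia = rb_entry(a, struct item, node);
	const struct item *ib = rb_entry(b, struct item, node);

	if (ia->id < ib->id)
		return -1;
	if (ia->id > ib->id)
		return 1;
	return 0;
}

/* Caller holds the write-side lock paired with the tree's seqcount. */
static bool item_insert(struct rb_root *tree, struct item *new)
{
	/* NULL on success; the existing node is returned on a duplicate key. */
	return rb_find_add_rcu(&new->node, tree, item_cmp) == NULL;
}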
@@ -150,41 +163,33 @@ static void mnt_ns_release(struct mnt_namespace *ns)
 }
 DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
 
+static void mnt_ns_release_rcu(struct rcu_head *rcu)
+{
+	mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu));
+}
+
 static void mnt_ns_tree_remove(struct mnt_namespace *ns)
 {
 	/* remove from global mount namespace list */
 	if (!is_anon_ns(ns)) {
-		guard(write_lock)(&mnt_ns_tree_lock);
+		mnt_ns_tree_write_lock();
 		rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
+		mnt_ns_tree_write_unlock();
 	}
 
-	mnt_ns_release(ns);
+	call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
 }
 
-/*
- * Returns the mount namespace which either has the specified id, or has the
- * next smallest id afer the specified one.
- */
-static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
+static int mnt_ns_find(const void *key, const struct rb_node *node)
 {
-	struct rb_node *node = mnt_ns_tree.rb_node;
-	struct mnt_namespace *ret = NULL;
+	const u64 mnt_ns_id = *(u64 *)key;
+	const struct mnt_namespace *ns = node_to_mnt_ns(node);
 
-	lockdep_assert_held(&mnt_ns_tree_lock);
-
-	while (node) {
-		struct mnt_namespace *n = node_to_mnt_ns(node);
-
-		if (mnt_ns_id <= n->seq) {
-			ret = node_to_mnt_ns(node);
-			if (mnt_ns_id == n->seq)
-				break;
-			node = node->rb_left;
-		} else {
-			node = node->rb_right;
-		}
-	}
-	return ret;
+	if (mnt_ns_id < ns->seq)
+		return -1;
+	if (mnt_ns_id > ns->seq)
+		return 1;
+	return 0;
 }
 
 /*
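Deferring mnt_ns_release() through call_rcu() is what makes the upcoming lockless lookup safe: a reader that still finds the node in the tree is guaranteed the backing object outlives its RCU read-side critical section, so it may take a reference unconditionally. The general shape of that deferred-release pattern, shown with an invented struct item rather than struct mnt_namespace:

struct item {
	struct rb_node node;
	struct rcu_head rcu;
	u64 id;
};

static void item_free_rcu(struct rcu_head *rcu)
{
	/* Runs only after a grace period: no RCU reader can still see the node. */
	kfree(container_of(rcu, struct item, rcu));
}

static void item_remove(struct rb_root *tree, struct item *it)
{
	rb_erase(&it->node, tree);	/* caller holds the write lock + seqcount */
	call_rcu(&it->rcu, item_free_rcu);
}

In namespace.c the callback does not free unconditionally; it drops the passive reference via mnt_ns_release(), which frees the namespace only once the count hits zero.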
@@ -194,18 +199,37 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
  * namespace the @namespace_sem must first be acquired. If the namespace has
  * already shut down before acquiring @namespace_sem, {list,stat}mount() will
  * see that the mount rbtree of the namespace is empty.
+ *
+ * Note the lookup is lockless protected by a sequence counter. We only
+ * need to guard against false negatives as false positives aren't
+ * possible. So if we didn't find a mount namespace and the sequence
+ * counter has changed we need to retry. If the sequence counter is
+ * still the same we know the search actually failed.
  */
 static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
 {
-	struct mnt_namespace *ns;
+	struct mnt_namespace *ns;
+	struct rb_node *node;
+	unsigned int seq;
+
+	guard(rcu)();
+	do {
+		seq = read_seqcount_begin(&mnt_ns_tree_seqcount);
+		node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
+		if (node)
+			break;
+	} while (read_seqcount_retry(&mnt_ns_tree_seqcount, seq));
 
-	guard(read_lock)(&mnt_ns_tree_lock);
-	ns = mnt_ns_find_id_at(mnt_ns_id);
-	if (!ns || ns->seq != mnt_ns_id)
-		return NULL;
+	if (!node)
+		return NULL;
 
-	refcount_inc(&ns->passive);
-	return ns;
+	/*
+	 * The last reference count is put with RCU delay so we can
+	 * unconditionally acquire a reference here.
+	 */
+	ns = node_to_mnt_ns(node);
+	refcount_inc(&ns->passive);
+	return ns;
 }
 
 static inline void lock_mount_hash(void)
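The loop above only retries a miss: if rb_find_rcu() returns a node it really is the namespace with that id, and RCU-delayed freeing keeps the object valid long enough to take the passive reference, so false positives cannot occur. A hypothetical caller (the function name and error handling are invented for illustration, not part of the patch) pairs the lookup with mnt_ns_release() to drop the reference it took:

static int example_use_mnt_ns(u64 mnt_ns_id)
{
	struct mnt_namespace *ns;

	ns = lookup_mnt_ns(mnt_ns_id);	/* takes a passive reference on success */
	if (!ns)
		return -ENOENT;

	/* ... inspect the namespace, e.g. after taking namespace_sem ... */

	mnt_ns_release(ns);		/* drop the passive reference */
	return 0;
}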