@@ -79,6 +79,8 @@ static DECLARE_RWSEM(namespace_sem);
 static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
 static DEFINE_RWLOCK(mnt_ns_tree_lock);
+static seqcount_rwlock_t mnt_ns_tree_seqcount = SEQCNT_RWLOCK_ZERO(mnt_ns_tree_seqcount, &mnt_ns_tree_lock);
+
 static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
 
 struct mount_kattr {
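
The new counter is a seqcount_rwlock_t tied to mnt_ns_tree_lock, so lockdep can verify the counter is only bumped with the write lock held. A minimal sketch of the writer/reader pattern, using hypothetical demo_* names that are not part of this patch:

#include <linux/seqlock.h>
#include <linux/spinlock.h>

static DEFINE_RWLOCK(demo_lock);
static seqcount_rwlock_t demo_seq = SEQCNT_RWLOCK_ZERO(demo_seq, &demo_lock);
static u64 demo_value;

static void demo_update(u64 v)
{
	write_lock(&demo_lock);			/* serialize writers */
	write_seqcount_begin(&demo_seq);	/* odd count: readers retry */
	demo_value = v;
	write_seqcount_end(&demo_seq);
	write_unlock(&demo_lock);
}

static u64 demo_read(void)
{
	unsigned int seq;
	u64 v;

	do {
		seq = read_seqcount_begin(&demo_seq);
		v = demo_value;			/* lockless read */
	} while (read_seqcount_retry(&demo_seq, seq));

	return v;
}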
@@ -105,37 +107,48 @@ EXPORT_SYMBOL_GPL(fs_kobj);
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
 
-static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
-{
-	u64 seq_b = ns->seq;
-
-	if (seq < seq_b)
-		return -1;
-	if (seq > seq_b)
-		return 1;
-	return 0;
-}
-
 static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
 {
 	if (!node)
 		return NULL;
 	return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
 }
 
-static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
+static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
 {
 	struct mnt_namespace *ns_a = node_to_mnt_ns(a);
 	struct mnt_namespace *ns_b = node_to_mnt_ns(b);
 	u64 seq_a = ns_a->seq;
+	u64 seq_b = ns_b->seq;
 
-	return mnt_ns_cmp(seq_a, ns_b) < 0;
+	if (seq_a < seq_b)
+		return -1;
+	if (seq_a > seq_b)
+		return 1;
+	return 0;
+}
+
+static inline void mnt_ns_tree_write_lock(void)
+{
+	write_lock(&mnt_ns_tree_lock);
+	write_seqcount_begin(&mnt_ns_tree_seqcount);
+}
+
+static inline void mnt_ns_tree_write_unlock(void)
+{
+	write_seqcount_end(&mnt_ns_tree_seqcount);
+	write_unlock(&mnt_ns_tree_lock);
 }
 
 static void mnt_ns_tree_add(struct mnt_namespace *ns)
 {
-	guard(write_lock)(&mnt_ns_tree_lock);
-	rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
+	struct rb_node *node;
+
+	mnt_ns_tree_write_lock();
+	node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
+	mnt_ns_tree_write_unlock();
+
+	WARN_ON_ONCE(node);
 }
 
 static void mnt_ns_release(struct mnt_namespace *ns)
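
rb_find_add_rcu() does the find-or-insert in one pass: it returns the existing node if an equal key is already present (leaving the tree unchanged) and NULL after a successful insert, publishing the new node with RCU-safe stores so lockless readers never observe a torn tree. A sketch of the calling convention, reusing demo_lock/demo_seq from the sketch above and a hypothetical demo_node type (illustrative, not from the patch):

#include <linux/rbtree.h>
#include <linux/rcupdate.h>

struct demo_node {
	u64 key;
	struct rb_node rb;
	struct rcu_head rcu;	/* used by the release sketch further down */
};

static struct rb_root demo_tree = RB_ROOT;

/* Three-way node-vs-node comparison, same shape as mnt_ns_cmp(). */
static int demo_cmp(struct rb_node *a, const struct rb_node *b)
{
	u64 ka = rb_entry(a, struct demo_node, rb)->key;
	u64 kb = rb_entry(b, struct demo_node, rb)->key;

	if (ka < kb)
		return -1;
	if (ka > kb)
		return 1;
	return 0;
}

static void demo_insert(struct demo_node *new)
{
	struct rb_node *dup;

	write_lock(&demo_lock);			/* writers: rwlock + seqcount */
	write_seqcount_begin(&demo_seq);
	dup = rb_find_add_rcu(&new->rb, &demo_tree, demo_cmp);
	write_seqcount_end(&demo_seq);
	write_unlock(&demo_lock);

	WARN_ON_ONCE(dup);			/* keys are expected to be unique */
}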
@@ -150,41 +163,33 @@ static void mnt_ns_release(struct mnt_namespace *ns)
 }
 DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
 
+static void mnt_ns_release_rcu(struct rcu_head *rcu)
+{
+	mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu));
+}
+
 static void mnt_ns_tree_remove(struct mnt_namespace *ns)
 {
 	/* remove from global mount namespace list */
 	if (!is_anon_ns(ns)) {
-		guard(write_lock)(&mnt_ns_tree_lock);
+		mnt_ns_tree_write_lock();
 		rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
+		mnt_ns_tree_write_unlock();
 	}
 
-	mnt_ns_release(ns);
+	call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
 }
 
-/*
- * Returns the mount namespace which either has the specified id, or has the
- * next smallest id after the specified one.
- */
-static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
+static int mnt_ns_find(const void *key, const struct rb_node *node)
 {
-	struct rb_node *node = mnt_ns_tree.rb_node;
-	struct mnt_namespace *ret = NULL;
+	const u64 mnt_ns_id = *(u64 *)key;
+	const struct mnt_namespace *ns = node_to_mnt_ns(node);
 
-	lockdep_assert_held(&mnt_ns_tree_lock);
-
-	while (node) {
-		struct mnt_namespace *n = node_to_mnt_ns(node);
-
-		if (mnt_ns_id <= n->seq) {
-			ret = node_to_mnt_ns(node);
-			if (mnt_ns_id == n->seq)
-				break;
-			node = node->rb_left;
-		} else {
-			node = node->rb_right;
-		}
-	}
-	return ret;
+	if (mnt_ns_id < ns->seq)
+		return -1;
+	if (mnt_ns_id > ns->seq)
+		return 1;
+	return 0;
 }
 
 /*
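
The removal side pairs rb_erase() under the write lock with call_rcu() for the final put, so a reader that raced with the erase and still holds a pointer keeps a valid object until the grace period ends. Continuing the hypothetical demo_node sketch (the patch itself relies on the new mnt_ns_rcu head embedded in struct mnt_namespace):

#include <linux/slab.h>

static void demo_release_rcu(struct rcu_head *rcu)
{
	/* Map the embedded rcu_head back to its container, then free. */
	kfree(container_of(rcu, struct demo_node, rcu));
}

static void demo_remove(struct demo_node *node)
{
	write_lock(&demo_lock);
	write_seqcount_begin(&demo_seq);
	rb_erase(&node->rb, &demo_tree);	/* unreachable for new lookups */
	write_seqcount_end(&demo_seq);
	write_unlock(&demo_lock);

	/*
	 * Existing RCU readers may still hold the pointer; free it only
	 * after a grace period has elapsed.
	 */
	call_rcu(&node->rcu, demo_release_rcu);
}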
@@ -194,18 +199,37 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
  * namespace the @namespace_sem must first be acquired. If the namespace has
  * already shut down before acquiring @namespace_sem, {list,stat}mount() will
  * see that the mount rbtree of the namespace is empty.
+ *
+ * Note the lookup is lockless, protected by a sequence counter. We only
+ * need to guard against false negatives as false positives aren't
+ * possible. So if we didn't find a mount namespace and the sequence
+ * counter has changed we need to retry. If the sequence counter is
+ * still the same we know the search actually failed.
  */
 static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
 {
-	struct mnt_namespace *ns;
+	struct mnt_namespace *ns;
+	struct rb_node *node;
+	unsigned int seq;
+
+	guard(rcu)();
+	do {
+		seq = read_seqcount_begin(&mnt_ns_tree_seqcount);
+		node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
+		if (node)
+			break;
+	} while (read_seqcount_retry(&mnt_ns_tree_seqcount, seq));
 
-	guard(read_lock)(&mnt_ns_tree_lock);
-	ns = mnt_ns_find_id_at(mnt_ns_id);
-	if (!ns || ns->seq != mnt_ns_id)
-		return NULL;
+	if (!node)
+		return NULL;
 
-	refcount_inc(&ns->passive);
-	return ns;
+	/*
+	 * The last reference count is put with RCU delay so we can
+	 * unconditionally acquire a reference here.
+	 */
+	ns = node_to_mnt_ns(node);
+	refcount_inc(&ns->passive);
+	return ns;
 }
 
 static inline void lock_mount_hash(void)
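
The asymmetry in the retry loop is the key point: a hit never needs a retry, because nodes are only freed after an RCU grace period, so whatever rb_find_rcu() returns stays valid for the rest of the read-side section. Only a miss can be a false negative, caused by observing the tree mid-rebalance, and that is exactly what the seqcount retry catches. The tail of the hypothetical demo_node sketch, mirroring lookup_mnt_ns():

/* Key-vs-node comparison for rb_find_rcu(), same shape as mnt_ns_find(). */
static int demo_find(const void *key, const struct rb_node *node)
{
	u64 k = *(const u64 *)key;
	u64 nk = rb_entry(node, struct demo_node, rb)->key;

	if (k < nk)
		return -1;
	if (k > nk)
		return 1;
	return 0;
}

static struct demo_node *demo_lookup(u64 key)
{
	struct rb_node *node;
	unsigned int seq;

	guard(rcu)();		/* rcu_read_lock() for the rest of the scope */
	do {
		seq = read_seqcount_begin(&demo_seq);
		node = rb_find_rcu(&key, &demo_tree, demo_find);
		if (node)
			break;	/* a hit is always trustworthy */
	} while (read_seqcount_retry(&demo_seq, seq));

	if (!node)
		return NULL;

	/*
	 * A real caller must take a reference before the RCU section ends,
	 * as lookup_mnt_ns() does with refcount_inc(&ns->passive).
	 */
	return rb_entry(node, struct demo_node, rb);
}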