@@ -78,6 +78,8 @@ static struct kmem_cache *mnt_cache __ro_after_init;
7878static DECLARE_RWSEM (namespace_sem );
7979static HLIST_HEAD (unmounted ); /* protected by namespace_sem */
8080static LIST_HEAD (ex_mountpoints ); /* protected by namespace_sem */
81+ static DEFINE_RWLOCK (mnt_ns_tree_lock );
82+ static struct rb_root mnt_ns_tree = RB_ROOT ; /* protected by mnt_ns_tree_lock */
8183
8284struct mount_kattr {
8385 unsigned int attr_set ;
@@ -103,6 +105,109 @@ EXPORT_SYMBOL_GPL(fs_kobj);
103105 */
104106__cacheline_aligned_in_smp DEFINE_SEQLOCK (mount_lock );
105107
108+ static int mnt_ns_cmp (u64 seq , const struct mnt_namespace * ns )
109+ {
110+ u64 seq_b = ns -> seq ;
111+
112+ if (seq < seq_b )
113+ return -1 ;
114+ if (seq > seq_b )
115+ return 1 ;
116+ return 0 ;
117+ }
118+
119+ static inline struct mnt_namespace * node_to_mnt_ns (const struct rb_node * node )
120+ {
121+ if (!node )
122+ return NULL ;
123+ return rb_entry (node , struct mnt_namespace , mnt_ns_tree_node );
124+ }
125+
126+ static bool mnt_ns_less (struct rb_node * a , const struct rb_node * b )
127+ {
128+ struct mnt_namespace * ns_a = node_to_mnt_ns (a );
129+ struct mnt_namespace * ns_b = node_to_mnt_ns (b );
130+ u64 seq_a = ns_a -> seq ;
131+
132+ return mnt_ns_cmp (seq_a , ns_b ) < 0 ;
133+ }
134+
135+ static void mnt_ns_tree_add (struct mnt_namespace * ns )
136+ {
137+ guard (write_lock )(& mnt_ns_tree_lock );
138+ rb_add (& ns -> mnt_ns_tree_node , & mnt_ns_tree , mnt_ns_less );
139+ }
140+
141+ static void mnt_ns_release (struct mnt_namespace * ns )
142+ {
143+ lockdep_assert_not_held (& mnt_ns_tree_lock );
144+
145+ /* keep alive for {list,stat}mount() */
146+ if (refcount_dec_and_test (& ns -> passive )) {
147+ put_user_ns (ns -> user_ns );
148+ kfree (ns );
149+ }
150+ }
151+ DEFINE_FREE (mnt_ns_release , struct mnt_namespace * , if (_T ) mnt_ns_release (_T ))
152+
153+ static void mnt_ns_tree_remove (struct mnt_namespace * ns )
154+ {
155+ /* remove from global mount namespace list */
156+ if (!is_anon_ns (ns )) {
157+ guard (write_lock )(& mnt_ns_tree_lock );
158+ rb_erase (& ns -> mnt_ns_tree_node , & mnt_ns_tree );
159+ }
160+
161+ mnt_ns_release (ns );
162+ }
163+
164+ /*
165+ * Returns the mount namespace which either has the specified id, or has the
166+ * next smallest id afer the specified one.
167+ */
168+ static struct mnt_namespace * mnt_ns_find_id_at (u64 mnt_ns_id )
169+ {
170+ struct rb_node * node = mnt_ns_tree .rb_node ;
171+ struct mnt_namespace * ret = NULL ;
172+
173+ lockdep_assert_held (& mnt_ns_tree_lock );
174+
175+ while (node ) {
176+ struct mnt_namespace * n = node_to_mnt_ns (node );
177+
178+ if (mnt_ns_id <= n -> seq ) {
179+ ret = node_to_mnt_ns (node );
180+ if (mnt_ns_id == n -> seq )
181+ break ;
182+ node = node -> rb_left ;
183+ } else {
184+ node = node -> rb_right ;
185+ }
186+ }
187+ return ret ;
188+ }
189+
190+ /*
191+ * Lookup a mount namespace by id and take a passive reference count. Taking a
192+ * passive reference means the mount namespace can be emptied if e.g., the last
193+ * task holding an active reference exits. To access the mounts of the
194+ * namespace the @namespace_sem must first be acquired. If the namespace has
195+ * already shut down before acquiring @namespace_sem, {list,stat}mount() will
196+ * see that the mount rbtree of the namespace is empty.
197+ */
198+ static struct mnt_namespace * lookup_mnt_ns (u64 mnt_ns_id )
199+ {
200+ struct mnt_namespace * ns ;
201+
202+ guard (read_lock )(& mnt_ns_tree_lock );
203+ ns = mnt_ns_find_id_at (mnt_ns_id );
204+ if (!ns || ns -> seq != mnt_ns_id )
205+ return NULL ;
206+
207+ refcount_inc (& ns -> passive );
208+ return ns ;
209+ }
210+
106211static inline void lock_mount_hash (void )
107212{
108213 write_seqlock (& mount_lock );
@@ -3733,8 +3838,7 @@ static void free_mnt_ns(struct mnt_namespace *ns)
37333838 if (!is_anon_ns (ns ))
37343839 ns_free_inum (& ns -> ns );
37353840 dec_mnt_namespaces (ns -> ucounts );
3736- put_user_ns (ns -> user_ns );
3737- kfree (ns );
3841+ mnt_ns_tree_remove (ns );
37383842}
37393843
37403844/*
@@ -3773,7 +3877,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
37733877 if (!anon )
37743878 new_ns -> seq = atomic64_add_return (1 , & mnt_ns_seq );
37753879 refcount_set (& new_ns -> ns .count , 1 );
3880+ refcount_set (& new_ns -> passive , 1 );
37763881 new_ns -> mounts = RB_ROOT ;
3882+ RB_CLEAR_NODE (& new_ns -> mnt_ns_tree_node );
37773883 init_waitqueue_head (& new_ns -> poll );
37783884 new_ns -> user_ns = get_user_ns (user_ns );
37793885 new_ns -> ucounts = ucounts ;
@@ -3850,6 +3956,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
38503956 while (p -> mnt .mnt_root != q -> mnt .mnt_root )
38513957 p = next_mnt (skip_mnt_tree (p ), old );
38523958 }
3959+ mnt_ns_tree_add (new_ns );
38533960 namespace_unlock ();
38543961
38553962 if (rootmnt )
@@ -4867,6 +4974,12 @@ static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq)
48674974 return 0 ;
48684975}
48694976
4977+ static void statmount_mnt_ns_id (struct kstatmount * s , struct mnt_namespace * ns )
4978+ {
4979+ s -> sm .mask |= STATMOUNT_MNT_NS_ID ;
4980+ s -> sm .mnt_ns_id = ns -> seq ;
4981+ }
4982+
48704983static int statmount_string (struct kstatmount * s , u64 flag )
48714984{
48724985 int ret ;
@@ -4930,14 +5043,15 @@ static int copy_statmount_to_user(struct kstatmount *s)
49305043static int do_statmount (struct kstatmount * s )
49315044{
49325045 struct mount * m = real_mount (s -> mnt );
5046+ struct mnt_namespace * ns = m -> mnt_ns ;
49335047 int err ;
49345048
49355049 /*
49365050 * Don't trigger audit denials. We just want to determine what
49375051 * mounts to show users.
49385052 */
49395053 if (!is_path_reachable (m , m -> mnt .mnt_root , & s -> root ) &&
4940- !ns_capable_noaudit (& init_user_ns , CAP_SYS_ADMIN ))
5054+ !ns_capable_noaudit (ns -> user_ns , CAP_SYS_ADMIN ))
49415055 return - EPERM ;
49425056
49435057 err = security_sb_statfs (s -> mnt -> mnt_root );
@@ -4962,6 +5076,9 @@ static int do_statmount(struct kstatmount *s)
49625076 if (!err && s -> mask & STATMOUNT_MNT_POINT )
49635077 err = statmount_string (s , STATMOUNT_MNT_POINT );
49645078
5079+ if (!err && s -> mask & STATMOUNT_MNT_NS_ID )
5080+ statmount_mnt_ns_id (s , ns );
5081+
49655082 if (err )
49665083 return err ;
49675084
@@ -5003,7 +5120,7 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req,
50035120 int ret ;
50045121 size_t usize ;
50055122
5006- BUILD_BUG_ON (sizeof (struct mnt_id_req ) != MNT_ID_REQ_SIZE_VER0 );
5123+ BUILD_BUG_ON (sizeof (struct mnt_id_req ) != MNT_ID_REQ_SIZE_VER1 );
50075124
50085125 ret = get_user (usize , & req -> size );
50095126 if (ret )
@@ -5021,10 +5138,63 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req,
50215138 return 0 ;
50225139}
50235140
5141+ static struct mount * listmnt_next (struct mount * curr , bool reverse )
5142+ {
5143+ struct rb_node * node ;
5144+
5145+ if (reverse )
5146+ node = rb_prev (& curr -> mnt_node );
5147+ else
5148+ node = rb_next (& curr -> mnt_node );
5149+
5150+ return node_to_mount (node );
5151+ }
5152+
5153+ static int grab_requested_root (struct mnt_namespace * ns , struct path * root )
5154+ {
5155+ struct mount * first ;
5156+
5157+ rwsem_assert_held (& namespace_sem );
5158+
5159+ /* We're looking at our own ns, just use get_fs_root. */
5160+ if (ns == current -> nsproxy -> mnt_ns ) {
5161+ get_fs_root (current -> fs , root );
5162+ return 0 ;
5163+ }
5164+
5165+ /*
5166+ * We have to find the first mount in our ns and use that, however it
5167+ * may not exist, so handle that properly.
5168+ */
5169+ if (RB_EMPTY_ROOT (& ns -> mounts ))
5170+ return - ENOENT ;
5171+
5172+ first = listmnt_next (ns -> root , false);
5173+ if (!first )
5174+ return - ENOENT ;
5175+ root -> mnt = mntget (& first -> mnt );
5176+ root -> dentry = dget (root -> mnt -> mnt_root );
5177+ return 0 ;
5178+ }
5179+
5180+ /*
5181+ * If the user requested a specific mount namespace id, look that up and return
5182+ * that, or if not simply grab a passive reference on our mount namespace and
5183+ * return that.
5184+ */
5185+ static struct mnt_namespace * grab_requested_mnt_ns (u64 mnt_ns_id )
5186+ {
5187+ if (mnt_ns_id )
5188+ return lookup_mnt_ns (mnt_ns_id );
5189+ refcount_inc (& current -> nsproxy -> mnt_ns -> passive );
5190+ return current -> nsproxy -> mnt_ns ;
5191+ }
5192+
50245193SYSCALL_DEFINE4 (statmount , const struct mnt_id_req __user * , req ,
50255194 struct statmount __user * , buf , size_t , bufsize ,
50265195 unsigned int , flags )
50275196{
5197+ struct mnt_namespace * ns __free (mnt_ns_release ) = NULL ;
50285198 struct vfsmount * mnt ;
50295199 struct mnt_id_req kreq ;
50305200 struct kstatmount ks ;
@@ -5039,21 +5209,41 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
50395209 if (ret )
50405210 return ret ;
50415211
5212+ ns = grab_requested_mnt_ns (kreq .mnt_ns_id );
5213+ if (!ns )
5214+ return - ENOENT ;
5215+
5216+ if (kreq .mnt_ns_id && (ns != current -> nsproxy -> mnt_ns ) &&
5217+ !ns_capable_noaudit (ns -> user_ns , CAP_SYS_ADMIN ))
5218+ return - ENOENT ;
5219+
50425220retry :
50435221 ret = prepare_kstatmount (& ks , & kreq , buf , bufsize , seq_size );
50445222 if (ret )
50455223 return ret ;
50465224
50475225 down_read (& namespace_sem );
5048- mnt = lookup_mnt_in_ns (kreq .mnt_id , current -> nsproxy -> mnt_ns );
5226+ /* Has the namespace already been emptied? */
5227+ if (kreq .mnt_ns_id && RB_EMPTY_ROOT (& ns -> mounts )) {
5228+ up_read (& namespace_sem );
5229+ kvfree (ks .seq .buf );
5230+ return - ENOENT ;
5231+ }
5232+
5233+ mnt = lookup_mnt_in_ns (kreq .mnt_id , ns );
50495234 if (!mnt ) {
50505235 up_read (& namespace_sem );
50515236 kvfree (ks .seq .buf );
50525237 return - ENOENT ;
50535238 }
50545239
50555240 ks .mnt = mnt ;
5056- get_fs_root (current -> fs , & ks .root );
5241+ ret = grab_requested_root (ns , & ks .root );
5242+ if (ret ) {
5243+ up_read (& namespace_sem );
5244+ kvfree (ks .seq .buf );
5245+ return ret ;
5246+ }
50575247 ret = do_statmount (& ks );
50585248 path_put (& ks .root );
50595249 up_read (& namespace_sem );
@@ -5066,30 +5256,21 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
50665256 return ret ;
50675257}
50685258
5069- static struct mount * listmnt_next (struct mount * curr , bool reverse )
5070- {
5071- struct rb_node * node ;
5072-
5073- if (reverse )
5074- node = rb_prev (& curr -> mnt_node );
5075- else
5076- node = rb_next (& curr -> mnt_node );
5077-
5078- return node_to_mount (node );
5079- }
5080-
5081- static ssize_t do_listmount (u64 mnt_parent_id , u64 last_mnt_id , u64 * mnt_ids ,
5082- size_t nr_mnt_ids , bool reverse )
5259+ static ssize_t do_listmount (struct mnt_namespace * ns , u64 mnt_parent_id ,
5260+ u64 last_mnt_id , u64 * mnt_ids , size_t nr_mnt_ids ,
5261+ bool reverse )
50835262{
50845263 struct path root __free (path_put ) = {};
5085- struct mnt_namespace * ns = current -> nsproxy -> mnt_ns ;
50865264 struct path orig ;
50875265 struct mount * r , * first ;
50885266 ssize_t ret ;
50895267
50905268 rwsem_assert_held (& namespace_sem );
50915269
5092- get_fs_root (current -> fs , & root );
5270+ ret = grab_requested_root (ns , & root );
5271+ if (ret )
5272+ return ret ;
5273+
50935274 if (mnt_parent_id == LSMT_ROOT ) {
50945275 orig = root ;
50955276 } else {
@@ -5104,7 +5285,7 @@ static ssize_t do_listmount(u64 mnt_parent_id, u64 last_mnt_id, u64 *mnt_ids,
51045285 * mounts to show users.
51055286 */
51065287 if (!is_path_reachable (real_mount (orig .mnt ), orig .dentry , & root ) &&
5107- !ns_capable_noaudit (& init_user_ns , CAP_SYS_ADMIN ))
5288+ !ns_capable_noaudit (ns -> user_ns , CAP_SYS_ADMIN ))
51085289 return - EPERM ;
51095290
51105291 ret = security_sb_statfs (orig .dentry );
@@ -5141,6 +5322,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
51415322{
51425323 u64 * kmnt_ids __free (kvfree ) = NULL ;
51435324 const size_t maxcount = 1000000 ;
5325+ struct mnt_namespace * ns __free (mnt_ns_release ) = NULL ;
51445326 struct mnt_id_req kreq ;
51455327 ssize_t ret ;
51465328
@@ -5167,8 +5349,16 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
51675349 if (!kmnt_ids )
51685350 return - ENOMEM ;
51695351
5352+ ns = grab_requested_mnt_ns (kreq .mnt_ns_id );
5353+ if (!ns )
5354+ return - ENOENT ;
5355+
5356+ if (kreq .mnt_ns_id && (ns != current -> nsproxy -> mnt_ns ) &&
5357+ !ns_capable_noaudit (ns -> user_ns , CAP_SYS_ADMIN ))
5358+ return - ENOENT ;
5359+
51705360 scoped_guard (rwsem_read , & namespace_sem )
5171- ret = do_listmount (kreq .mnt_id , kreq .param , kmnt_ids ,
5361+ ret = do_listmount (ns , kreq .mnt_id , kreq .param , kmnt_ids ,
51725362 nr_mnt_ids , (flags & LISTMOUNT_REVERSE ));
51735363
51745364 if (copy_to_user (mnt_ids , kmnt_ids , ret * sizeof (* mnt_ids )))
@@ -5204,6 +5394,8 @@ static void __init init_mount_tree(void)
52045394
52055395 set_fs_pwd (current -> fs , & root );
52065396 set_fs_root (current -> fs , & root );
5397+
5398+ mnt_ns_tree_add (ns );
52075399}
52085400
52095401void __init mnt_init (void )
0 commit comments