Skip to content

Commit 9b93f50

Browse files
committed
Merge tag 'fs.idmapped.mnt_idmap.v6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/idmapping
Pull idmapping updates from Christian Brauner: "Last cycle we've already made the interaction with idmapped mounts more robust and type safe by introducing the vfs{g,u}id_t type. This cycle we concluded the conversion and removed the legacy helpers. Currently we still pass around the plain namespace that was attached to a mount. This is in general pretty convenient but it makes it easy to conflate namespaces that are relevant on the filesystem - with namespaces that are relevent on the mount level. Especially for filesystem developers without detailed knowledge in this area this can be a potential source for bugs. Instead of passing the plain namespace we introduce a dedicated type struct mnt_idmap and replace the pointer with a pointer to a struct mnt_idmap. There are no semantic or size changes for the mount struct caused by this. We then start converting all places aware of idmapped mounts to rely on struct mnt_idmap. Once the conversion is done all helpers down to the really low-level make_vfs{g,u}id() and from_vfs{g,u}id() will take a struct mnt_idmap argument instead of two namespace arguments. This way it becomes impossible to conflate the two removing and thus eliminating the possibility of any bugs. Fwiw, I fixed some issues in that area a while ago in ntfs3 and ksmbd in the past. Afterwards only low-level code can ultimately use the associated namespace for any permission checks. Even most of the vfs can be completely obivious about this ultimately and filesystems will never interact with it in any form in the future. A struct mnt_idmap currently encompasses a simple refcount and pointer to the relevant namespace the mount is idmapped to. If a mount isn't idmapped then it will point to a static nop_mnt_idmap and if it doesn't that it is idmapped. As usual there are no allocations or anything happening for non-idmapped mounts. Everthing is carefully written to be a nop for non-idmapped mounts as has always been the case. If an idmapped mount is created a struct mnt_idmap is allocated and a reference taken on the relevant namespace. Each mount that gets idmapped or inherits the idmap simply bumps the reference count on struct mnt_idmap. Just a reminder that we only allow a mount to change it's idmapping a single time and only if it hasn't already been attached to the filesystems and has no active writers. The actual changes are fairly straightforward but this will have huge benefits for maintenance and security in the long run even if it causes some churn. Note that this also makes it possible to extend struct mount_idmap in the future. For example, it would be possible to place the namespace pointer in an anonymous union together with an idmapping struct. This would allow us to expose an api to userspace that would let it specify idmappings directly instead of having to go through the detour of setting up namespaces at all" * tag 'fs.idmapped.mnt_idmap.v6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/idmapping: acl: conver higher-level helpers to rely on mnt_idmap fs: introduce dedicated idmap type for mounts
2 parents e1212e9 + 5a6f52d commit 9b93f50

File tree

8 files changed

+196
-82
lines changed

8 files changed

+196
-82
lines changed

fs/internal.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -227,28 +227,28 @@ struct xattr_ctx {
227227
};
228228

229229

230-
ssize_t do_getxattr(struct user_namespace *mnt_userns,
230+
ssize_t do_getxattr(struct mnt_idmap *idmap,
231231
struct dentry *d,
232232
struct xattr_ctx *ctx);
233233

234234
int setxattr_copy(const char __user *name, struct xattr_ctx *ctx);
235-
int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
235+
int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
236236
struct xattr_ctx *ctx);
237237
int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode);
238238

239239
#ifdef CONFIG_FS_POSIX_ACL
240-
int do_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
240+
int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
241241
const char *acl_name, const void *kvalue, size_t size);
242-
ssize_t do_get_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
242+
ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry,
243243
const char *acl_name, void *kvalue, size_t size);
244244
#else
245-
static inline int do_set_acl(struct user_namespace *mnt_userns,
245+
static inline int do_set_acl(struct mnt_idmap *idmap,
246246
struct dentry *dentry, const char *acl_name,
247247
const void *kvalue, size_t size)
248248
{
249249
return -EOPNOTSUPP;
250250
}
251-
static inline ssize_t do_get_acl(struct user_namespace *mnt_userns,
251+
static inline ssize_t do_get_acl(struct mnt_idmap *idmap,
252252
struct dentry *dentry, const char *acl_name,
253253
void *kvalue, size_t size)
254254
{

fs/namespace.c

Lines changed: 142 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -75,13 +75,30 @@ static DECLARE_RWSEM(namespace_sem);
7575
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
7676
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
7777

78+
struct mnt_idmap {
79+
struct user_namespace *owner;
80+
refcount_t count;
81+
};
82+
83+
/*
84+
* Carries the initial idmapping of 0:0:4294967295 which is an identity
85+
* mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is
86+
* mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...].
87+
*/
88+
struct mnt_idmap nop_mnt_idmap = {
89+
.owner = &init_user_ns,
90+
.count = REFCOUNT_INIT(1),
91+
};
92+
EXPORT_SYMBOL_GPL(nop_mnt_idmap);
93+
7894
struct mount_kattr {
7995
unsigned int attr_set;
8096
unsigned int attr_clr;
8197
unsigned int propagation;
8298
unsigned int lookup_flags;
8399
bool recurse;
84100
struct user_namespace *mnt_userns;
101+
struct mnt_idmap *mnt_idmap;
85102
};
86103

87104
/* /sys/fs */
@@ -193,6 +210,104 @@ int mnt_get_count(struct mount *mnt)
193210
#endif
194211
}
195212

213+
/**
214+
* mnt_idmap_owner - retrieve owner of the mount's idmapping
215+
* @idmap: mount idmapping
216+
*
217+
* This helper will go away once the conversion to use struct mnt_idmap
218+
* everywhere has finished at which point the helper will be unexported.
219+
*
220+
* Only code that needs to perform permission checks based on the owner of the
221+
* idmapping will get access to it. All other code will solely rely on
222+
* idmappings. This will get us type safety so it's impossible to conflate
223+
* filesystems idmappings with mount idmappings.
224+
*
225+
* Return: The owner of the idmapping.
226+
*/
227+
struct user_namespace *mnt_idmap_owner(const struct mnt_idmap *idmap)
228+
{
229+
return idmap->owner;
230+
}
231+
EXPORT_SYMBOL_GPL(mnt_idmap_owner);
232+
233+
/**
234+
* mnt_user_ns - retrieve owner of an idmapped mount
235+
* @mnt: the relevant vfsmount
236+
*
237+
* This helper will go away once the conversion to use struct mnt_idmap
238+
* everywhere has finished at which point the helper will be unexported.
239+
*
240+
* Only code that needs to perform permission checks based on the owner of the
241+
* idmapping will get access to it. All other code will solely rely on
242+
* idmappings. This will get us type safety so it's impossible to conflate
243+
* filesystems idmappings with mount idmappings.
244+
*
245+
* Return: The owner of the idmapped.
246+
*/
247+
struct user_namespace *mnt_user_ns(const struct vfsmount *mnt)
248+
{
249+
struct mnt_idmap *idmap = mnt_idmap(mnt);
250+
251+
/* Return the actual owner of the filesystem instead of the nop. */
252+
if (idmap == &nop_mnt_idmap &&
253+
!initial_idmapping(mnt->mnt_sb->s_user_ns))
254+
return mnt->mnt_sb->s_user_ns;
255+
return mnt_idmap_owner(idmap);
256+
}
257+
EXPORT_SYMBOL_GPL(mnt_user_ns);
258+
259+
/**
260+
* alloc_mnt_idmap - allocate a new idmapping for the mount
261+
* @mnt_userns: owning userns of the idmapping
262+
*
263+
* Allocate a new struct mnt_idmap which carries the idmapping of the mount.
264+
*
265+
* Return: On success a new idmap, on error an error pointer is returned.
266+
*/
267+
static struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns)
268+
{
269+
struct mnt_idmap *idmap;
270+
271+
idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT);
272+
if (!idmap)
273+
return ERR_PTR(-ENOMEM);
274+
275+
idmap->owner = get_user_ns(mnt_userns);
276+
refcount_set(&idmap->count, 1);
277+
return idmap;
278+
}
279+
280+
/**
281+
* mnt_idmap_get - get a reference to an idmapping
282+
* @idmap: the idmap to bump the reference on
283+
*
284+
* If @idmap is not the @nop_mnt_idmap bump the reference count.
285+
*
286+
* Return: @idmap with reference count bumped if @not_mnt_idmap isn't passed.
287+
*/
288+
static inline struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap)
289+
{
290+
if (idmap != &nop_mnt_idmap)
291+
refcount_inc(&idmap->count);
292+
293+
return idmap;
294+
}
295+
296+
/**
297+
* mnt_idmap_put - put a reference to an idmapping
298+
* @idmap: the idmap to put the reference on
299+
*
300+
* If this is a non-initial idmapping, put the reference count when a mount is
301+
* released and free it if we're the last user.
302+
*/
303+
static inline void mnt_idmap_put(struct mnt_idmap *idmap)
304+
{
305+
if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) {
306+
put_user_ns(idmap->owner);
307+
kfree(idmap);
308+
}
309+
}
310+
196311
static struct mount *alloc_vfsmnt(const char *name)
197312
{
198313
struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -232,7 +347,7 @@ static struct mount *alloc_vfsmnt(const char *name)
232347
INIT_HLIST_NODE(&mnt->mnt_mp_list);
233348
INIT_LIST_HEAD(&mnt->mnt_umounting);
234349
INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
235-
mnt->mnt.mnt_userns = &init_user_ns;
350+
mnt->mnt.mnt_idmap = &nop_mnt_idmap;
236351
}
237352
return mnt;
238353

@@ -602,11 +717,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
602717

603718
static void free_vfsmnt(struct mount *mnt)
604719
{
605-
struct user_namespace *mnt_userns;
606-
607-
mnt_userns = mnt_user_ns(&mnt->mnt);
608-
if (!initial_idmapping(mnt_userns))
609-
put_user_ns(mnt_userns);
720+
mnt_idmap_put(mnt_idmap(&mnt->mnt));
610721
kfree_const(mnt->mnt_devname);
611722
#ifdef CONFIG_SMP
612723
free_percpu(mnt->mnt_pcp);
@@ -1009,7 +1120,6 @@ static struct mount *skip_mnt_tree(struct mount *p)
10091120
struct vfsmount *vfs_create_mount(struct fs_context *fc)
10101121
{
10111122
struct mount *mnt;
1012-
struct user_namespace *fs_userns;
10131123

10141124
if (!fc->root)
10151125
return ERR_PTR(-EINVAL);
@@ -1027,10 +1137,6 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc)
10271137
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
10281138
mnt->mnt_parent = mnt;
10291139

1030-
fs_userns = mnt->mnt.mnt_sb->s_user_ns;
1031-
if (!initial_idmapping(fs_userns))
1032-
mnt->mnt.mnt_userns = get_user_ns(fs_userns);
1033-
10341140
lock_mount_hash();
10351141
list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
10361142
unlock_mount_hash();
@@ -1120,9 +1226,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
11201226
mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
11211227

11221228
atomic_inc(&sb->s_active);
1123-
mnt->mnt.mnt_userns = mnt_user_ns(&old->mnt);
1124-
if (!initial_idmapping(mnt->mnt.mnt_userns))
1125-
mnt->mnt.mnt_userns = get_user_ns(mnt->mnt.mnt_userns);
1229+
mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
1230+
11261231
mnt->mnt.mnt_sb = sb;
11271232
mnt->mnt.mnt_root = dget(root);
11281233
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
@@ -3982,14 +4087,14 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
39824087
struct vfsmount *m = &mnt->mnt;
39834088
struct user_namespace *fs_userns = m->mnt_sb->s_user_ns;
39844089

3985-
if (!kattr->mnt_userns)
4090+
if (!kattr->mnt_idmap)
39864091
return 0;
39874092

39884093
/*
39894094
* Creating an idmapped mount with the filesystem wide idmapping
39904095
* doesn't make sense so block that. We don't allow mushy semantics.
39914096
*/
3992-
if (kattr->mnt_userns == fs_userns)
4097+
if (mnt_idmap_owner(kattr->mnt_idmap) == fs_userns)
39934098
return -EINVAL;
39944099

39954100
/*
@@ -4029,7 +4134,7 @@ static inline bool mnt_allow_writers(const struct mount_kattr *kattr,
40294134
{
40304135
return (!(kattr->attr_set & MNT_READONLY) ||
40314136
(mnt->mnt.mnt_flags & MNT_READONLY)) &&
4032-
!kattr->mnt_userns;
4137+
!kattr->mnt_idmap;
40334138
}
40344139

40354140
static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
@@ -4083,27 +4188,18 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
40834188

40844189
static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
40854190
{
4086-
struct user_namespace *mnt_userns, *old_mnt_userns;
4087-
4088-
if (!kattr->mnt_userns)
4191+
if (!kattr->mnt_idmap)
40894192
return;
40904193

40914194
/*
4092-
* We're the only ones able to change the mount's idmapping. So
4093-
* mnt->mnt.mnt_userns is stable and we can retrieve it directly.
4094-
*/
4095-
old_mnt_userns = mnt->mnt.mnt_userns;
4096-
4097-
mnt_userns = get_user_ns(kattr->mnt_userns);
4098-
/* Pairs with smp_load_acquire() in mnt_user_ns(). */
4099-
smp_store_release(&mnt->mnt.mnt_userns, mnt_userns);
4100-
4101-
/*
4102-
* If this is an idmapped filesystem drop the reference we've taken
4103-
* in vfs_create_mount() before.
4195+
* Pairs with smp_load_acquire() in mnt_idmap().
4196+
*
4197+
* Since we only allow a mount to change the idmapping once and
4198+
* verified this in can_idmap_mount() we know that the mount has
4199+
* @nop_mnt_idmap attached to it. So there's no need to drop any
4200+
* references.
41044201
*/
4105-
if (!initial_idmapping(old_mnt_userns))
4106-
put_user_ns(old_mnt_userns);
4202+
smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap));
41074203
}
41084204

41094205
static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
@@ -4137,6 +4233,15 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
41374233
if (path->dentry != mnt->mnt.mnt_root)
41384234
return -EINVAL;
41394235

4236+
if (kattr->mnt_userns) {
4237+
struct mnt_idmap *mnt_idmap;
4238+
4239+
mnt_idmap = alloc_mnt_idmap(kattr->mnt_userns);
4240+
if (IS_ERR(mnt_idmap))
4241+
return PTR_ERR(mnt_idmap);
4242+
kattr->mnt_idmap = mnt_idmap;
4243+
}
4244+
41404245
if (kattr->propagation) {
41414246
/*
41424247
* Only take namespace_lock() if we're actually changing
@@ -4324,6 +4429,9 @@ static void finish_mount_kattr(struct mount_kattr *kattr)
43244429
{
43254430
put_user_ns(kattr->mnt_userns);
43264431
kattr->mnt_userns = NULL;
4432+
4433+
if (kattr->mnt_idmap)
4434+
mnt_idmap_put(kattr->mnt_idmap);
43274435
}
43284436

43294437
SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,

fs/posix_acl.c

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -871,7 +871,7 @@ EXPORT_SYMBOL (posix_acl_to_xattr);
871871

872872
/**
873873
* vfs_posix_acl_to_xattr - convert from kernel to userspace representation
874-
* @mnt_userns: user namespace of the mount
874+
* @idmap: idmap of the mount
875875
* @inode: inode the posix acls are set on
876876
* @acl: the posix acls as represented by the vfs
877877
* @buffer: the buffer into which to convert @acl
@@ -884,7 +884,7 @@ EXPORT_SYMBOL (posix_acl_to_xattr);
884884
* Return: On success, the size of the stored uapi posix acls, on error a
885885
* negative errno.
886886
*/
887-
static ssize_t vfs_posix_acl_to_xattr(struct user_namespace *mnt_userns,
887+
static ssize_t vfs_posix_acl_to_xattr(struct mnt_idmap *idmap,
888888
struct inode *inode,
889889
const struct posix_acl *acl, void *buffer,
890890
size_t size)
@@ -893,6 +893,7 @@ static ssize_t vfs_posix_acl_to_xattr(struct user_namespace *mnt_userns,
893893
struct posix_acl_xattr_header *ext_acl = buffer;
894894
struct posix_acl_xattr_entry *ext_entry;
895895
struct user_namespace *fs_userns, *caller_userns;
896+
struct user_namespace *mnt_userns;
896897
ssize_t real_size, n;
897898
vfsuid_t vfsuid;
898899
vfsgid_t vfsgid;
@@ -908,6 +909,7 @@ static ssize_t vfs_posix_acl_to_xattr(struct user_namespace *mnt_userns,
908909

909910
fs_userns = i_user_ns(inode);
910911
caller_userns = current_user_ns();
912+
mnt_userns = mnt_idmap_owner(idmap);
911913
for (n=0; n < acl->a_count; n++, ext_entry++) {
912914
const struct posix_acl_entry *acl_e = &acl->a_entries[n];
913915
ext_entry->e_tag = cpu_to_le16(acl_e->e_tag);
@@ -1227,7 +1229,7 @@ int vfs_remove_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
12271229
}
12281230
EXPORT_SYMBOL_GPL(vfs_remove_acl);
12291231

1230-
int do_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
1232+
int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
12311233
const char *acl_name, const void *kvalue, size_t size)
12321234
{
12331235
int error;
@@ -1243,22 +1245,22 @@ int do_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
12431245
return PTR_ERR(acl);
12441246
}
12451247

1246-
error = vfs_set_acl(mnt_userns, dentry, acl_name, acl);
1248+
error = vfs_set_acl(mnt_idmap_owner(idmap), dentry, acl_name, acl);
12471249
posix_acl_release(acl);
12481250
return error;
12491251
}
12501252

1251-
ssize_t do_get_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
1253+
ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry,
12521254
const char *acl_name, void *kvalue, size_t size)
12531255
{
12541256
ssize_t error;
12551257
struct posix_acl *acl;
12561258

1257-
acl = vfs_get_acl(mnt_userns, dentry, acl_name);
1259+
acl = vfs_get_acl(mnt_idmap_owner(idmap), dentry, acl_name);
12581260
if (IS_ERR(acl))
12591261
return PTR_ERR(acl);
12601262

1261-
error = vfs_posix_acl_to_xattr(mnt_userns, d_inode(dentry),
1263+
error = vfs_posix_acl_to_xattr(idmap, d_inode(dentry),
12621264
acl, kvalue, size);
12631265
posix_acl_release(acl);
12641266
return error;

0 commit comments

Comments
 (0)