Skip to content

Commit 678379e

Browse files
author
Al Viro
committed
close_range(): fix the logics in descriptor table trimming
Cloning a descriptor table picks the size that would cover all currently opened files. That's fine for clone() and unshare(), but for close_range() there's an additional twist - we clone before we close, and it would be a shame to have close_range(3, ~0U, CLOSE_RANGE_UNSHARE) leave us with a huge descriptor table when we are not going to keep anything past stderr, just because some large file descriptor used to be open before our call has taken it out. Unfortunately, it had been dealt with in an inherently racy way - sane_fdtable_size() gets a "don't copy anything past that" argument (passed via unshare_fd() and dup_fd()), close_range() decides how much should be trimmed and passes that to unshare_fd(). The problem is, a range that used to extend to the end of descriptor table back when close_range() had looked at it might very well have stuff grown after it by the time dup_fd() has allocated a new files_struct and started to figure out the capacity of fdtable to be attached to that. That leads to interesting pathological cases; at the very least it's a QoI issue, since unshare(CLONE_FILES) is atomic in a sense that it takes a snapshot of descriptor table one might have observed at some point. Since CLOSE_RANGE_UNSHARE close_range() is supposed to be a combination of unshare(CLONE_FILES) with plain close_range(), ending up with a weird state that would never occur with unshare(2) is confusing, to put it mildly. It's not hard to get rid of - all it takes is passing both ends of the range down to sane_fdtable_size(). There we are under ->files_lock, so the race is trivially avoided. So we do the following: * switch close_files() from calling unshare_fd() to calling dup_fd(). * undo the calling convention change done to unshare_fd() in 60997c3 "close_range: add CLOSE_RANGE_UNSHARE" * introduce struct fd_range, pass a pointer to that to dup_fd() and sane_fdtable_size() instead of "trim everything past that point" they are currently getting. NULL means "we are not going to be punching any holes"; NR_OPEN_MAX is gone. * make sane_fdtable_size() use find_last_bit() instead of open-coding it; it's easier to follow that way. * while we are at it, have dup_fd() report errors by returning ERR_PTR(), no need to use a separate int *errorp argument. Fixes: 60997c3 "close_range: add CLOSE_RANGE_UNSHARE" Cc: [email protected] Signed-off-by: Al Viro <[email protected]>
1 parent 9852d85 commit 678379e

File tree

3 files changed

+52
-83
lines changed

3 files changed

+52
-83
lines changed

fs/file.c

Lines changed: 34 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -272,59 +272,45 @@ static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
272272
return test_bit(fd, fdt->open_fds);
273273
}
274274

275-
static unsigned int count_open_files(struct fdtable *fdt)
276-
{
277-
unsigned int size = fdt->max_fds;
278-
unsigned int i;
279-
280-
/* Find the last open fd */
281-
for (i = size / BITS_PER_LONG; i > 0; ) {
282-
if (fdt->open_fds[--i])
283-
break;
284-
}
285-
i = (i + 1) * BITS_PER_LONG;
286-
return i;
287-
}
288-
289275
/*
290276
* Note that a sane fdtable size always has to be a multiple of
291277
* BITS_PER_LONG, since we have bitmaps that are sized by this.
292278
*
293-
* 'max_fds' will normally already be properly aligned, but it
294-
* turns out that in the close_range() -> __close_range() ->
295-
* unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end
296-
* up having a 'max_fds' value that isn't already aligned.
297-
*
298-
* Rather than make close_range() have to worry about this,
299-
* just make that BITS_PER_LONG alignment be part of a sane
300-
* fdtable size. Becuase that's really what it is.
279+
* punch_hole is optional - when close_range() is asked to unshare
280+
* and close, we don't need to copy descriptors in that range, so
281+
* a smaller cloned descriptor table might suffice if the last
282+
* currently opened descriptor falls into that range.
301283
*/
302-
static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
284+
static unsigned int sane_fdtable_size(struct fdtable *fdt, struct fd_range *punch_hole)
303285
{
304-
unsigned int count;
305-
306-
count = count_open_files(fdt);
307-
if (max_fds < NR_OPEN_DEFAULT)
308-
max_fds = NR_OPEN_DEFAULT;
309-
return ALIGN(min(count, max_fds), BITS_PER_LONG);
286+
unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds);
287+
288+
if (last == fdt->max_fds)
289+
return NR_OPEN_DEFAULT;
290+
if (punch_hole && punch_hole->to >= last && punch_hole->from <= last) {
291+
last = find_last_bit(fdt->open_fds, punch_hole->from);
292+
if (last == punch_hole->from)
293+
return NR_OPEN_DEFAULT;
294+
}
295+
return ALIGN(last + 1, BITS_PER_LONG);
310296
}
311297

312298
/*
313-
* Allocate a new files structure and copy contents from the
314-
* passed in files structure.
315-
* errorp will be valid only when the returned files_struct is NULL.
299+
* Allocate a new descriptor table and copy contents from the passed in
300+
* instance. Returns a pointer to cloned table on success, ERR_PTR()
301+
* on failure. For 'punch_hole' see sane_fdtable_size().
316302
*/
317-
struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp)
303+
struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_hole)
318304
{
319305
struct files_struct *newf;
320306
struct file **old_fds, **new_fds;
321307
unsigned int open_files, i;
322308
struct fdtable *old_fdt, *new_fdt;
309+
int error;
323310

324-
*errorp = -ENOMEM;
325311
newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
326312
if (!newf)
327-
goto out;
313+
return ERR_PTR(-ENOMEM);
328314

329315
atomic_set(&newf->count, 1);
330316

@@ -341,7 +327,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
341327

342328
spin_lock(&oldf->file_lock);
343329
old_fdt = files_fdtable(oldf);
344-
open_files = sane_fdtable_size(old_fdt, max_fds);
330+
open_files = sane_fdtable_size(old_fdt, punch_hole);
345331

346332
/*
347333
* Check whether we need to allocate a larger fd array and fd set.
@@ -354,14 +340,14 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
354340

355341
new_fdt = alloc_fdtable(open_files - 1);
356342
if (!new_fdt) {
357-
*errorp = -ENOMEM;
343+
error = -ENOMEM;
358344
goto out_release;
359345
}
360346

361347
/* beyond sysctl_nr_open; nothing to do */
362348
if (unlikely(new_fdt->max_fds < open_files)) {
363349
__free_fdtable(new_fdt);
364-
*errorp = -EMFILE;
350+
error = -EMFILE;
365351
goto out_release;
366352
}
367353

@@ -372,7 +358,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
372358
*/
373359
spin_lock(&oldf->file_lock);
374360
old_fdt = files_fdtable(oldf);
375-
open_files = sane_fdtable_size(old_fdt, max_fds);
361+
open_files = sane_fdtable_size(old_fdt, punch_hole);
376362
}
377363

378364
copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG);
@@ -406,8 +392,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
406392

407393
out_release:
408394
kmem_cache_free(files_cachep, newf);
409-
out:
410-
return NULL;
395+
return ERR_PTR(error);
411396
}
412397

413398
static struct fdtable *close_files(struct files_struct * files)
@@ -748,37 +733,25 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
748733
if (fd > max_fd)
749734
return -EINVAL;
750735

751-
if (flags & CLOSE_RANGE_UNSHARE) {
752-
int ret;
753-
unsigned int max_unshare_fds = NR_OPEN_MAX;
736+
if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) {
737+
struct fd_range range = {fd, max_fd}, *punch_hole = &range;
754738

755739
/*
756740
* If the caller requested all fds to be made cloexec we always
757741
* copy all of the file descriptors since they still want to
758742
* use them.
759743
*/
760-
if (!(flags & CLOSE_RANGE_CLOEXEC)) {
761-
/*
762-
* If the requested range is greater than the current
763-
* maximum, we're closing everything so only copy all
764-
* file descriptors beneath the lowest file descriptor.
765-
*/
766-
rcu_read_lock();
767-
if (max_fd >= last_fd(files_fdtable(cur_fds)))
768-
max_unshare_fds = fd;
769-
rcu_read_unlock();
770-
}
771-
772-
ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds);
773-
if (ret)
774-
return ret;
744+
if (flags & CLOSE_RANGE_CLOEXEC)
745+
punch_hole = NULL;
775746

747+
fds = dup_fd(cur_fds, punch_hole);
748+
if (IS_ERR(fds))
749+
return PTR_ERR(fds);
776750
/*
777751
* We used to share our file descriptor table, and have now
778752
* created a private one, make sure we're using it below.
779753
*/
780-
if (fds)
781-
swap(cur_fds, fds);
754+
swap(cur_fds, fds);
782755
}
783756

784757
if (flags & CLOSE_RANGE_CLOEXEC)

include/linux/fdtable.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
* as this is the granularity returned by copy_fdset().
2323
*/
2424
#define NR_OPEN_DEFAULT BITS_PER_LONG
25-
#define NR_OPEN_MAX ~0U
2625

2726
struct fdtable {
2827
unsigned int max_fds;
@@ -106,7 +105,10 @@ struct task_struct;
106105

107106
void put_files_struct(struct files_struct *fs);
108107
int unshare_files(void);
109-
struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy;
108+
struct fd_range {
109+
unsigned int from, to;
110+
};
111+
struct files_struct *dup_fd(struct files_struct *, struct fd_range *) __latent_entropy;
110112
void do_close_on_exec(struct files_struct *);
111113
int iterate_fd(struct files_struct *, unsigned,
112114
int (*)(const void *, struct file *, unsigned),
@@ -115,8 +117,6 @@ int iterate_fd(struct files_struct *, unsigned,
115117
extern int close_fd(unsigned int fd);
116118
extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags);
117119
extern struct file *file_close_fd(unsigned int fd);
118-
extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
119-
struct files_struct **new_fdp);
120120

121121
extern struct kmem_cache *files_cachep;
122122

kernel/fork.c

Lines changed: 14 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1756,33 +1756,30 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
17561756
int no_files)
17571757
{
17581758
struct files_struct *oldf, *newf;
1759-
int error = 0;
17601759

17611760
/*
17621761
* A background process may not have any files ...
17631762
*/
17641763
oldf = current->files;
17651764
if (!oldf)
1766-
goto out;
1765+
return 0;
17671766

17681767
if (no_files) {
17691768
tsk->files = NULL;
1770-
goto out;
1769+
return 0;
17711770
}
17721771

17731772
if (clone_flags & CLONE_FILES) {
17741773
atomic_inc(&oldf->count);
1775-
goto out;
1774+
return 0;
17761775
}
17771776

1778-
newf = dup_fd(oldf, NR_OPEN_MAX, &error);
1779-
if (!newf)
1780-
goto out;
1777+
newf = dup_fd(oldf, NULL);
1778+
if (IS_ERR(newf))
1779+
return PTR_ERR(newf);
17811780

17821781
tsk->files = newf;
1783-
error = 0;
1784-
out:
1785-
return error;
1782+
return 0;
17861783
}
17871784

17881785
static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
@@ -3238,17 +3235,16 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
32383235
/*
32393236
* Unshare file descriptor table if it is being shared
32403237
*/
3241-
int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
3242-
struct files_struct **new_fdp)
3238+
static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
32433239
{
32443240
struct files_struct *fd = current->files;
3245-
int error = 0;
32463241

32473242
if ((unshare_flags & CLONE_FILES) &&
32483243
(fd && atomic_read(&fd->count) > 1)) {
3249-
*new_fdp = dup_fd(fd, max_fds, &error);
3250-
if (!*new_fdp)
3251-
return error;
3244+
fd = dup_fd(fd, NULL);
3245+
if (IS_ERR(fd))
3246+
return PTR_ERR(fd);
3247+
*new_fdp = fd;
32523248
}
32533249

32543250
return 0;
@@ -3306,7 +3302,7 @@ int ksys_unshare(unsigned long unshare_flags)
33063302
err = unshare_fs(unshare_flags, &new_fs);
33073303
if (err)
33083304
goto bad_unshare_out;
3309-
err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
3305+
err = unshare_fd(unshare_flags, &new_fd);
33103306
if (err)
33113307
goto bad_unshare_cleanup_fs;
33123308
err = unshare_userns(unshare_flags, &new_cred);
@@ -3398,7 +3394,7 @@ int unshare_files(void)
33983394
struct files_struct *old, *copy = NULL;
33993395
int error;
34003396

3401-
error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
3397+
error = unshare_fd(CLONE_FILES, &copy);
34023398
if (error || !copy)
34033399
return error;
34043400

0 commit comments

Comments
 (0)