@@ -272,59 +272,45 @@ static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
272272 return test_bit (fd , fdt -> open_fds );
273273}
274274
275- static unsigned int count_open_files (struct fdtable * fdt )
276- {
277- unsigned int size = fdt -> max_fds ;
278- unsigned int i ;
279-
280- /* Find the last open fd */
281- for (i = size / BITS_PER_LONG ; i > 0 ; ) {
282- if (fdt -> open_fds [-- i ])
283- break ;
284- }
285- i = (i + 1 ) * BITS_PER_LONG ;
286- return i ;
287- }
288-
289275/*
290276 * Note that a sane fdtable size always has to be a multiple of
291277 * BITS_PER_LONG, since we have bitmaps that are sized by this.
292278 *
293- * 'max_fds' will normally already be properly aligned, but it
294- * turns out that in the close_range() -> __close_range() ->
295- * unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end
296- * up having a 'max_fds' value that isn't already aligned.
297- *
298- * Rather than make close_range() have to worry about this,
299- * just make that BITS_PER_LONG alignment be part of a sane
299- * fdtable size. Because that's really what it is.
279+ * punch_hole is optional - when close_range() is asked to unshare
280+ * and close, we don't need to copy descriptors in that range, so
281+ * a smaller cloned descriptor table might suffice if the last
282+ * currently opened descriptor falls into that range.
301283 */
302- static unsigned int sane_fdtable_size (struct fdtable * fdt , unsigned int max_fds )
284+ static unsigned int sane_fdtable_size (struct fdtable * fdt , struct fd_range * punch_hole )
303285{
304- unsigned int count ;
305-
306- count = count_open_files (fdt );
307- if (max_fds < NR_OPEN_DEFAULT )
308- max_fds = NR_OPEN_DEFAULT ;
309- return ALIGN (min (count , max_fds ), BITS_PER_LONG );
286+ unsigned int last = find_last_bit (fdt -> open_fds , fdt -> max_fds );
287+
288+ if (last == fdt -> max_fds )
289+ return NR_OPEN_DEFAULT ;
290+ if (punch_hole && punch_hole -> to >= last && punch_hole -> from <= last ) {
291+ last = find_last_bit (fdt -> open_fds , punch_hole -> from );
292+ if (last == punch_hole -> from )
293+ return NR_OPEN_DEFAULT ;
294+ }
295+ return ALIGN (last + 1 , BITS_PER_LONG );
310296}
311297
312298/*
313- * Allocate a new files structure and copy contents from the
314- * passed in files structure.
315- * errorp will be valid only when the returned files_struct is NULL .
299+ * Allocate a new descriptor table and copy contents from the passed in
300+ * instance. Returns a pointer to the cloned table on success, ERR_PTR()
301+ * on failure. For 'punch_hole' see sane_fdtable_size() .
316302 */
317- struct files_struct * dup_fd (struct files_struct * oldf , unsigned int max_fds , int * errorp )
303+ struct files_struct * dup_fd (struct files_struct * oldf , struct fd_range * punch_hole )
318304{
319305 struct files_struct * newf ;
320306 struct file * * old_fds , * * new_fds ;
321307 unsigned int open_files , i ;
322308 struct fdtable * old_fdt , * new_fdt ;
309+ int error ;
323310
324- * errorp = - ENOMEM ;
325311 newf = kmem_cache_alloc (files_cachep , GFP_KERNEL );
326312 if (!newf )
327- goto out ;
313+ return ERR_PTR ( - ENOMEM ) ;
328314
329315 atomic_set (& newf -> count , 1 );
330316
@@ -341,7 +327,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
341327
342328 spin_lock (& oldf -> file_lock );
343329 old_fdt = files_fdtable (oldf );
344- open_files = sane_fdtable_size (old_fdt , max_fds );
330+ open_files = sane_fdtable_size (old_fdt , punch_hole );
345331
346332 /*
347333 * Check whether we need to allocate a larger fd array and fd set.
@@ -354,14 +340,14 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
354340
355341 new_fdt = alloc_fdtable (open_files - 1 );
356342 if (!new_fdt ) {
357- * errorp = - ENOMEM ;
343+ error = - ENOMEM ;
358344 goto out_release ;
359345 }
360346
361347 /* beyond sysctl_nr_open; nothing to do */
362348 if (unlikely (new_fdt -> max_fds < open_files )) {
363349 __free_fdtable (new_fdt );
364- * errorp = - EMFILE ;
350+ error = - EMFILE ;
365351 goto out_release ;
366352 }
367353
@@ -372,7 +358,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
372358 */
373359 spin_lock (& oldf -> file_lock );
374360 old_fdt = files_fdtable (oldf );
375- open_files = sane_fdtable_size (old_fdt , max_fds );
361+ open_files = sane_fdtable_size (old_fdt , punch_hole );
376362 }
377363
378364 copy_fd_bitmaps (new_fdt , old_fdt , open_files / BITS_PER_LONG );
@@ -406,8 +392,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
406392
407393out_release :
408394 kmem_cache_free (files_cachep , newf );
409- out :
410- return NULL ;
395+ return ERR_PTR (error );
411396}
412397
413398static struct fdtable * close_files (struct files_struct * files )
@@ -748,37 +733,25 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
748733 if (fd > max_fd )
749734 return - EINVAL ;
750735
751- if (flags & CLOSE_RANGE_UNSHARE ) {
752- int ret ;
753- unsigned int max_unshare_fds = NR_OPEN_MAX ;
736+ if ((flags & CLOSE_RANGE_UNSHARE ) && atomic_read (& cur_fds -> count ) > 1 ) {
737+ struct fd_range range = {fd , max_fd }, * punch_hole = & range ;
754738
755739 /*
756740 * If the caller requested all fds to be made cloexec we always
757741 * copy all of the file descriptors since they still want to
758742 * use them.
759743 */
760- if (!(flags & CLOSE_RANGE_CLOEXEC )) {
761- /*
762- * If the requested range is greater than the current
763- * maximum, we're closing everything so only copy all
764- * file descriptors beneath the lowest file descriptor.
765- */
766- rcu_read_lock ();
767- if (max_fd >= last_fd (files_fdtable (cur_fds )))
768- max_unshare_fds = fd ;
769- rcu_read_unlock ();
770- }
771-
772- ret = unshare_fd (CLONE_FILES , max_unshare_fds , & fds );
773- if (ret )
774- return ret ;
744+ if (flags & CLOSE_RANGE_CLOEXEC )
745+ punch_hole = NULL ;
775746
747+ fds = dup_fd (cur_fds , punch_hole );
748+ if (IS_ERR (fds ))
749+ return PTR_ERR (fds );
776750 /*
777751 * We used to share our file descriptor table, and have now
778752 * created a private one, make sure we're using it below.
779753 */
780- if (fds )
781- swap (cur_fds , fds );
754+ swap (cur_fds , fds );
782755 }
783756
784757 if (flags & CLOSE_RANGE_CLOEXEC )
0 commit comments