Skip to content

Commit d1e9a63

Browse files
committed
Merge tag 'vfs-6.11-rc1.fixes.2' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs fixes from Christian Brauner: "VFS: - The new 64bit mount ids start after the old mount id, i.e., at the first non-32 bit value. However, we started counting one id too late and thus lost 4294967296 as the first valid id. Fix that. - Update a few comments on some vfs_*() creation helpers. - Move copying of the xattr name out from the locks required to start a filesystem write. - Extend the filelock lock UAF fix to the compat code as well. - Now that we added the ability to look up an inode under RCU it's possible that lockless hash lookup can find and lock an inode after it gets I_FREEING set. It then waits until inode teardown in evict() is finished. The flag however is still set after evict() has woken up all waiters. If the inode lock is taken late enough on the waiting side after hash removal and wakeup happened the waiting thread will never be woken. Before RCU based lookup this was synchronized via the inode_hash_lock. But since unhashing requires the inode lock as well we can check whether the inode is unhashed while holding inode lock even without holding inode_hash_lock. pidfd: - The nsproxy structure contains nearly all of the namespaces associated with a task. When a namespace type isn't supported nsproxy might contain a NULL pointer or always point to the initial namespace type. The logic isn't consistent. So when deriving namespace fds we need to ensure that the namespace type is supported. First, so that we don't risk dereferencing NULL pointers. The correct bigger fix would be to change all namespaces to always set a valid namespace pointer in struct nsproxy independent of whether or not it is compiled in. But that requires quite a few changes. Second, so that we don't allow deriving namespace fds when the namespace type doesn't exist and thus when they couldn't also be derived via /proc/self/ns/. - Add missing selftests for the new pidfd ioctls to derive namespace fds. This simply extends the already existing testsuite. 
netfs: - Fix debug logging and fix kconfig variable name so it actually works. - Fix writeback that goes both to the server and cache. The streams are only activated once a subreq is added. When a server write happens the subreq doesn't need to have finished by the time the cache write is started. If the server write has already finished by the time the cache write is about to start the cache write will operate on a folio that might already have been reused. Fix this by preactivating the cache write. - Limit cachefiles subreq size for cache writes to MAX_RW_COUNT" * tag 'vfs-6.11-rc1.fixes.2' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: inode: clarify what's locked vfs: Fix potential circular locking through setxattr() and removexattr() filelock: Fix fcntl/close race recovery compat path fs: use all available ids cachefiles: Set the max subreq size for cache writes to MAX_RW_COUNT netfs: Fix writeback that needs to go to both server and cache pidfs: add selftests for new namespace ioctls pidfs: handle kernels without namespaces cleanly pidfs: when time ns disabled add check for ioctl vfs: correct the comments of vfs_*() helpers vfs: handle __wait_on_freeing_inode() and evict() race netfs: Rename CONFIG_FSCACHE_DEBUG to CONFIG_NETFS_DEBUG netfs: Revert "netfs: Switch debug logging to pr_debug()"
2 parents e44be00 + f5e5e97 commit d1e9a63

24 files changed

+488
-213
lines changed

fs/cachefiles/io.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -630,7 +630,7 @@ static void cachefiles_prepare_write_subreq(struct netfs_io_subrequest *subreq)
630630

631631
_enter("W=%x[%x] %llx", wreq->debug_id, subreq->debug_index, subreq->start);
632632

633-
subreq->max_len = ULONG_MAX;
633+
subreq->max_len = MAX_RW_COUNT;
634634
subreq->max_nr_segs = BIO_MAX_VECS;
635635

636636
if (!cachefiles_cres_file(cres)) {

fs/inode.c

Lines changed: 30 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -676,6 +676,16 @@ static void evict(struct inode *inode)
676676

677677
remove_inode_hash(inode);
678678

679+
/*
680+
* Wake up waiters in __wait_on_freeing_inode().
681+
*
682+
* Lockless hash lookup may end up finding the inode before we removed
683+
* it above, but only lock it *after* we are done with the wakeup below.
684+
* In this case the potential waiter cannot safely block.
685+
*
686+
* The inode being unhashed after the call to remove_inode_hash() is
687+
* used as an indicator whether blocking on it is safe.
688+
*/
679689
spin_lock(&inode->i_lock);
680690
wake_up_bit(&inode->i_state, __I_NEW);
681691
BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
@@ -888,18 +898,18 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
888898
return freed;
889899
}
890900

891-
static void __wait_on_freeing_inode(struct inode *inode, bool locked);
901+
static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked);
892902
/*
893903
* Called with the inode lock held.
894904
*/
895905
static struct inode *find_inode(struct super_block *sb,
896906
struct hlist_head *head,
897907
int (*test)(struct inode *, void *),
898-
void *data, bool locked)
908+
void *data, bool is_inode_hash_locked)
899909
{
900910
struct inode *inode = NULL;
901911

902-
if (locked)
912+
if (is_inode_hash_locked)
903913
lockdep_assert_held(&inode_hash_lock);
904914
else
905915
lockdep_assert_not_held(&inode_hash_lock);
@@ -913,7 +923,7 @@ static struct inode *find_inode(struct super_block *sb,
913923
continue;
914924
spin_lock(&inode->i_lock);
915925
if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
916-
__wait_on_freeing_inode(inode, locked);
926+
__wait_on_freeing_inode(inode, is_inode_hash_locked);
917927
goto repeat;
918928
}
919929
if (unlikely(inode->i_state & I_CREATING)) {
@@ -936,11 +946,11 @@ static struct inode *find_inode(struct super_block *sb,
936946
*/
937947
static struct inode *find_inode_fast(struct super_block *sb,
938948
struct hlist_head *head, unsigned long ino,
939-
bool locked)
949+
bool is_inode_hash_locked)
940950
{
941951
struct inode *inode = NULL;
942952

943-
if (locked)
953+
if (is_inode_hash_locked)
944954
lockdep_assert_held(&inode_hash_lock);
945955
else
946956
lockdep_assert_not_held(&inode_hash_lock);
@@ -954,7 +964,7 @@ static struct inode *find_inode_fast(struct super_block *sb,
954964
continue;
955965
spin_lock(&inode->i_lock);
956966
if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
957-
__wait_on_freeing_inode(inode, locked);
967+
__wait_on_freeing_inode(inode, is_inode_hash_locked);
958968
goto repeat;
959969
}
960970
if (unlikely(inode->i_state & I_CREATING)) {
@@ -2287,19 +2297,29 @@ EXPORT_SYMBOL(inode_needs_sync);
22872297
* wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
22882298
* will DTRT.
22892299
*/
2290-
static void __wait_on_freeing_inode(struct inode *inode, bool locked)
2300+
static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked)
22912301
{
22922302
wait_queue_head_t *wq;
22932303
DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
2304+
2305+
/*
2306+
* Handle racing against evict(), see that routine for more details.
2307+
*/
2308+
if (unlikely(inode_unhashed(inode))) {
2309+
WARN_ON(is_inode_hash_locked);
2310+
spin_unlock(&inode->i_lock);
2311+
return;
2312+
}
2313+
22942314
wq = bit_waitqueue(&inode->i_state, __I_NEW);
22952315
prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
22962316
spin_unlock(&inode->i_lock);
22972317
rcu_read_unlock();
2298-
if (locked)
2318+
if (is_inode_hash_locked)
22992319
spin_unlock(&inode_hash_lock);
23002320
schedule();
23012321
finish_wait(wq, &wait.wq_entry);
2302-
if (locked)
2322+
if (is_inode_hash_locked)
23032323
spin_lock(&inode_hash_lock);
23042324
rcu_read_lock();
23052325
}

fs/locks.c

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2570,8 +2570,9 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
25702570
error = do_lock_file_wait(filp, cmd, file_lock);
25712571

25722572
/*
2573-
* Attempt to detect a close/fcntl race and recover by releasing the
2574-
* lock that was just acquired. There is no need to do that when we're
2573+
* Detect close/fcntl races and recover by zapping all POSIX locks
2574+
* associated with this file and our files_struct, just like on
2575+
* filp_flush(). There is no need to do that when we're
25752576
* unlocking though, or for OFD locks.
25762577
*/
25772578
if (!error && file_lock->c.flc_type != F_UNLCK &&
@@ -2586,9 +2587,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
25862587
f = files_lookup_fd_locked(files, fd);
25872588
spin_unlock(&files->file_lock);
25882589
if (f != filp) {
2589-
file_lock->c.flc_type = F_UNLCK;
2590-
error = do_lock_file_wait(filp, cmd, file_lock);
2591-
WARN_ON_ONCE(error);
2590+
locks_remove_posix(filp, files);
25922591
error = -EBADF;
25932592
}
25942593
}

fs/namei.c

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3248,9 +3248,9 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap,
32483248
/**
32493249
* vfs_create - create new file
32503250
* @idmap: idmap of the mount the inode was found from
3251-
* @dir: inode of @dentry
3252-
* @dentry: pointer to dentry of the base directory
3253-
* @mode: mode of the new file
3251+
* @dir: inode of the parent directory
3252+
* @dentry: dentry of the child file
3253+
* @mode: mode of the child file
32543254
* @want_excl: whether the file must not yet exist
32553255
*
32563256
* Create a new file.
@@ -4047,9 +4047,9 @@ EXPORT_SYMBOL(user_path_create);
40474047
/**
40484048
* vfs_mknod - create device node or file
40494049
* @idmap: idmap of the mount the inode was found from
4050-
* @dir: inode of @dentry
4051-
* @dentry: pointer to dentry of the base directory
4052-
* @mode: mode of the new device node or file
4050+
* @dir: inode of the parent directory
4051+
* @dentry: dentry of the child device node
4052+
* @mode: mode of the child device node
40534053
* @dev: device number of device to create
40544054
*
40554055
* Create a device node or file.
@@ -4174,9 +4174,9 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d
41744174
/**
41754175
* vfs_mkdir - create directory
41764176
* @idmap: idmap of the mount the inode was found from
4177-
* @dir: inode of @dentry
4178-
* @dentry: pointer to dentry of the base directory
4179-
* @mode: mode of the new directory
4177+
* @dir: inode of the parent directory
4178+
* @dentry: dentry of the child directory
4179+
* @mode: mode of the child directory
41804180
*
41814181
* Create a directory.
41824182
*
@@ -4256,8 +4256,8 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
42564256
/**
42574257
* vfs_rmdir - remove directory
42584258
* @idmap: idmap of the mount the inode was found from
4259-
* @dir: inode of @dentry
4260-
* @dentry: pointer to dentry of the base directory
4259+
* @dir: inode of the parent directory
4260+
* @dentry: dentry of the child directory
42614261
*
42624262
* Remove a directory.
42634263
*
@@ -4537,8 +4537,8 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname)
45374537
/**
45384538
* vfs_symlink - create symlink
45394539
* @idmap: idmap of the mount the inode was found from
4540-
* @dir: inode of @dentry
4541-
* @dentry: pointer to dentry of the base directory
4540+
* @dir: inode of the parent directory
4541+
* @dentry: dentry of the child symlink file
45424542
* @oldname: name of the file to link to
45434543
*
45444544
* Create a symlink.

fs/namespace.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ static DEFINE_IDA(mnt_id_ida);
7070
static DEFINE_IDA(mnt_group_ida);
7171

7272
/* Don't allow confusion with old 32bit mount ID */
73-
#define MNT_UNIQUE_ID_OFFSET (1ULL << 32)
73+
#define MNT_UNIQUE_ID_OFFSET (1ULL << 31)
7474
static atomic64_t mnt_id_ctr = ATOMIC64_INIT(MNT_UNIQUE_ID_OFFSET);
7575

7676
static struct hlist_head *mount_hashtable __ro_after_init;

fs/netfs/Kconfig

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,14 @@ config NETFS_STATS
2222
between CPUs. On the other hand, the stats are very useful for
2323
debugging purposes. Saying 'Y' here is recommended.
2424

25+
config NETFS_DEBUG
26+
bool "Enable dynamic debugging netfslib and FS-Cache"
27+
depends on NETFS
28+
help
29+
This permits debugging to be dynamically enabled in the local caching
30+
management module. If this is set, the debugging output may be
31+
enabled by setting bits in /sys/module/netfs/parameters/debug.
32+
2533
config FSCACHE
2634
bool "General filesystem local caching manager"
2735
depends on NETFS_SUPPORT
@@ -50,13 +58,3 @@ config FSCACHE_STATS
5058
debugging purposes. Saying 'Y' here is recommended.
5159

5260
See Documentation/filesystems/caching/fscache.rst for more information.
53-
54-
config FSCACHE_DEBUG
55-
bool "Debug FS-Cache"
56-
depends on FSCACHE
57-
help
58-
This permits debugging to be dynamically enabled in the local caching
59-
management module. If this is set, the debugging output may be
60-
enabled by setting bits in /sys/modules/fscache/parameter/debug.
61-
62-
See Documentation/filesystems/caching/fscache.rst for more information.

fs/netfs/buffered_read.c

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
117117
if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
118118
if (folio->index == rreq->no_unlock_folio &&
119119
test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
120-
kdebug("no unlock");
120+
_debug("no unlock");
121121
else
122122
folio_unlock(folio);
123123
}
@@ -204,7 +204,7 @@ void netfs_readahead(struct readahead_control *ractl)
204204
struct netfs_inode *ctx = netfs_inode(ractl->mapping->host);
205205
int ret;
206206

207-
kenter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
207+
_enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
208208

209209
if (readahead_count(ractl) == 0)
210210
return;
@@ -268,7 +268,7 @@ int netfs_read_folio(struct file *file, struct folio *folio)
268268
struct folio *sink = NULL;
269269
int ret;
270270

271-
kenter("%lx", folio->index);
271+
_enter("%lx", folio->index);
272272

273273
rreq = netfs_alloc_request(mapping, file,
274274
folio_pos(folio), folio_size(folio),
@@ -508,7 +508,7 @@ int netfs_write_begin(struct netfs_inode *ctx,
508508

509509
have_folio:
510510
*_folio = folio;
511-
kleave(" = 0");
511+
_leave(" = 0");
512512
return 0;
513513

514514
error_put:
@@ -518,7 +518,7 @@ int netfs_write_begin(struct netfs_inode *ctx,
518518
folio_unlock(folio);
519519
folio_put(folio);
520520
}
521-
kleave(" = %d", ret);
521+
_leave(" = %d", ret);
522522
return ret;
523523
}
524524
EXPORT_SYMBOL(netfs_write_begin);
@@ -536,7 +536,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio,
536536
size_t flen = folio_size(folio);
537537
int ret;
538538

539-
kenter("%zx @%llx", flen, start);
539+
_enter("%zx @%llx", flen, start);
540540

541541
ret = -ENOMEM;
542542

@@ -567,7 +567,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio,
567567
error_put:
568568
netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
569569
error:
570-
kleave(" = %d", ret);
570+
_leave(" = %d", ret);
571571
return ret;
572572
}
573573

fs/netfs/buffered_write.c

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
5656
struct netfs_group *group = netfs_folio_group(folio);
5757
loff_t pos = folio_pos(folio);
5858

59-
kenter("");
59+
_enter("");
6060

6161
if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE)
6262
return NETFS_FLUSH_CONTENT;
@@ -272,12 +272,12 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
272272
*/
273273
howto = netfs_how_to_modify(ctx, file, folio, netfs_group,
274274
flen, offset, part, maybe_trouble);
275-
kdebug("howto %u", howto);
275+
_debug("howto %u", howto);
276276
switch (howto) {
277277
case NETFS_JUST_PREFETCH:
278278
ret = netfs_prefetch_for_write(file, folio, offset, part);
279279
if (ret < 0) {
280-
kdebug("prefetch = %zd", ret);
280+
_debug("prefetch = %zd", ret);
281281
goto error_folio_unlock;
282282
}
283283
break;
@@ -418,7 +418,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
418418
}
419419

420420
iocb->ki_pos += written;
421-
kleave(" = %zd [%zd]", written, ret);
421+
_leave(" = %zd [%zd]", written, ret);
422422
return written ? written : ret;
423423

424424
error_folio_unlock:
@@ -491,7 +491,7 @@ ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
491491
struct netfs_inode *ictx = netfs_inode(inode);
492492
ssize_t ret;
493493

494-
kenter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
494+
_enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
495495

496496
if (!iov_iter_count(from))
497497
return 0;
@@ -529,7 +529,7 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
529529
vm_fault_t ret = VM_FAULT_RETRY;
530530
int err;
531531

532-
kenter("%lx", folio->index);
532+
_enter("%lx", folio->index);
533533

534534
sb_start_pagefault(inode->i_sb);
535535

fs/netfs/direct_read.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i
3333
size_t orig_count = iov_iter_count(iter);
3434
bool async = !is_sync_kiocb(iocb);
3535

36-
kenter("");
36+
_enter("");
3737

3838
if (!orig_count)
3939
return 0; /* Don't update atime */

fs/netfs/direct_write.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,15 +37,15 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
3737
size_t len = iov_iter_count(iter);
3838
bool async = !is_sync_kiocb(iocb);
3939

40-
kenter("");
40+
_enter("");
4141

4242
/* We're going to need a bounce buffer if what we transmit is going to
4343
* be different in some way to the source buffer, e.g. because it gets
4444
* encrypted/compressed or because it needs expanding to a block size.
4545
*/
4646
// TODO
4747

48-
kdebug("uw %llx-%llx", start, end);
48+
_debug("uw %llx-%llx", start, end);
4949

5050
wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp, start,
5151
iocb->ki_flags & IOCB_DIRECT ?
@@ -96,7 +96,7 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
9696
wreq->cleanup = netfs_cleanup_dio_write;
9797
ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len);
9898
if (ret < 0) {
99-
kdebug("begin = %zd", ret);
99+
_debug("begin = %zd", ret);
100100
goto out;
101101
}
102102

@@ -143,7 +143,7 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
143143
loff_t pos = iocb->ki_pos;
144144
unsigned long long end = pos + iov_iter_count(from) - 1;
145145

146-
kenter("%llx,%zx,%llx", pos, iov_iter_count(from), i_size_read(inode));
146+
_enter("%llx,%zx,%llx", pos, iov_iter_count(from), i_size_read(inode));
147147

148148
if (!iov_iter_count(from))
149149
return 0;

0 commit comments

Comments
 (0)