Skip to content

Commit 2a06298

Browse files
Zhihao Chengbrauner
authored andcommitted
vfs: Don't evict inode under the inode lru traversing context
The inode reclaiming process(See function prune_icache_sb) collects all reclaimable inodes and mark them with I_FREEING flag at first, at that time, other processes will be stuck if they try getting these inodes (See function find_inode_fast), then the reclaiming process destroy the inodes by function dispose_list(). Some filesystems(eg. ext4 with ea_inode feature, ubifs with xattr) may do inode lookup in the inode evicting callback function, if the inode lookup is operated under the inode lru traversing context, deadlock problems may happen. Case 1: In function ext4_evict_inode(), the ea inode lookup could happen if ea_inode feature is enabled, the lookup process will be stuck under the evicting context like this: 1. File A has inode i_reg and an ea inode i_ea 2. getfattr(A, xattr_buf) // i_ea is added into lru // lru->i_ea 3. Then, following three processes running like this: PA PB echo 2 > /proc/sys/vm/drop_caches shrink_slab prune_dcache_sb // i_reg is added into lru, lru->i_ea->i_reg prune_icache_sb list_lru_walk_one inode_lru_isolate i_ea->i_state |= I_FREEING // set inode state inode_lru_isolate __iget(i_reg) spin_unlock(&i_reg->i_lock) spin_unlock(lru_lock) rm file A i_reg->nlink = 0 iput(i_reg) // i_reg->nlink is 0, do evict ext4_evict_inode ext4_xattr_delete_inode ext4_xattr_inode_dec_ref_all ext4_xattr_inode_iget ext4_iget(i_ea->i_ino) iget_locked find_inode_fast __wait_on_freeing_inode(i_ea) ----→ AA deadlock dispose_list // cannot be executed by prune_icache_sb wake_up_bit(&i_ea->i_state) Case 2: In deleted inode writing function ubifs_jnl_write_inode(), file deleting process holds BASEHD's wbuf->io_mutex while getting the xattr inode, which could race with inode reclaiming process(The reclaiming process could try locking BASEHD's wbuf->io_mutex in inode evicting function), then an ABBA deadlock problem would happen as following: 1. File A has inode ia and a xattr(with inode ixa), regular file B has inode ib and a xattr. 2. getfattr(A, xattr_buf) // ixa is added into lru // lru->ixa 3. Then, following three processes running like this: PA PB PC echo 2 > /proc/sys/vm/drop_caches shrink_slab prune_dcache_sb // ib and ia are added into lru, lru->ixa->ib->ia prune_icache_sb list_lru_walk_one inode_lru_isolate ixa->i_state |= I_FREEING // set inode state inode_lru_isolate __iget(ib) spin_unlock(&ib->i_lock) spin_unlock(lru_lock) rm file B ib->nlink = 0 rm file A iput(ia) ubifs_evict_inode(ia) ubifs_jnl_delete_inode(ia) ubifs_jnl_write_inode(ia) make_reservation(BASEHD) // Lock wbuf->io_mutex ubifs_iget(ixa->i_ino) iget_locked find_inode_fast __wait_on_freeing_inode(ixa) | iput(ib) // ib->nlink is 0, do evict | ubifs_evict_inode | ubifs_jnl_delete_inode(ib) ↓ ubifs_jnl_write_inode ABBA deadlock ←-----make_reservation(BASEHD) dispose_list // cannot be executed by prune_icache_sb wake_up_bit(&ixa->i_state) Fix the possible deadlock by using new inode state flag I_LRU_ISOLATING to pin the inode in memory while inode_lru_isolate() reclaims its pages instead of using ordinary inode reference. This way inode deletion cannot be triggered from inode_lru_isolate() thus avoiding the deadlock. evict() is made to wait for I_LRU_ISOLATING to be cleared before proceeding with inode cleanup. Link: https://lore.kernel.org/all/[email protected]/ Link: https://bugzilla.kernel.org/show_bug.cgi?id=219022 Fixes: e50e512 ("ext4: xattr-in-inode support") Fixes: 7959cf3 ("ubifs: journal: Handle xattrs like files") Cc: [email protected] Signed-off-by: Zhihao Cheng <[email protected]> Link: https://lore.kernel.org/r/[email protected] Reviewed-by: Jan Kara <[email protected]> Suggested-by: Jan Kara <[email protected]> Suggested-by: Mateusz Guzik <[email protected]> Signed-off-by: Christian Brauner <[email protected]>
1 parent 7b589a9 commit 2a06298

File tree

2 files changed

+42
-2
lines changed

2 files changed

+42
-2
lines changed

fs/inode.c

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -488,6 +488,39 @@ static void inode_lru_list_del(struct inode *inode)
488488
this_cpu_dec(nr_unused);
489489
}
490490

491+
static void inode_pin_lru_isolating(struct inode *inode)
492+
{
493+
lockdep_assert_held(&inode->i_lock);
494+
WARN_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE));
495+
inode->i_state |= I_LRU_ISOLATING;
496+
}
497+
498+
static void inode_unpin_lru_isolating(struct inode *inode)
499+
{
500+
spin_lock(&inode->i_lock);
501+
WARN_ON(!(inode->i_state & I_LRU_ISOLATING));
502+
inode->i_state &= ~I_LRU_ISOLATING;
503+
smp_mb();
504+
wake_up_bit(&inode->i_state, __I_LRU_ISOLATING);
505+
spin_unlock(&inode->i_lock);
506+
}
507+
508+
static void inode_wait_for_lru_isolating(struct inode *inode)
509+
{
510+
spin_lock(&inode->i_lock);
511+
if (inode->i_state & I_LRU_ISOLATING) {
512+
DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LRU_ISOLATING);
513+
wait_queue_head_t *wqh;
514+
515+
wqh = bit_waitqueue(&inode->i_state, __I_LRU_ISOLATING);
516+
spin_unlock(&inode->i_lock);
517+
__wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE);
518+
spin_lock(&inode->i_lock);
519+
WARN_ON(inode->i_state & I_LRU_ISOLATING);
520+
}
521+
spin_unlock(&inode->i_lock);
522+
}
523+
491524
/**
492525
* inode_sb_list_add - add inode to the superblock list of inodes
493526
* @inode: inode to add
@@ -657,6 +690,8 @@ static void evict(struct inode *inode)
657690

658691
inode_sb_list_del(inode);
659692

693+
inode_wait_for_lru_isolating(inode);
694+
660695
/*
661696
* Wait for flusher thread to be done with the inode so that filesystem
662697
* does not start destroying it while writeback is still running. Since
@@ -855,7 +890,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
855890
* be under pressure before the cache inside the highmem zone.
856891
*/
857892
if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
858-
__iget(inode);
893+
inode_pin_lru_isolating(inode);
859894
spin_unlock(&inode->i_lock);
860895
spin_unlock(lru_lock);
861896
if (remove_inode_buffers(inode)) {
@@ -867,7 +902,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
867902
__count_vm_events(PGINODESTEAL, reap);
868903
mm_account_reclaimed_pages(reap);
869904
}
870-
iput(inode);
905+
inode_unpin_lru_isolating(inode);
871906
spin_lock(lru_lock);
872907
return LRU_RETRY;
873908
}

include/linux/fs.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2392,6 +2392,9 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
23922392
*
23932393
* I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback.
23942394
*
2395+
* I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding
2396+
* i_count.
2397+
*
23952398
* Q: What is the difference between I_WILL_FREE and I_FREEING?
23962399
*/
23972400
#define I_DIRTY_SYNC (1 << 0)
@@ -2415,6 +2418,8 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
24152418
#define I_DONTCACHE (1 << 16)
24162419
#define I_SYNC_QUEUED (1 << 17)
24172420
#define I_PINNING_NETFS_WB (1 << 18)
2421+
#define __I_LRU_ISOLATING 19
2422+
#define I_LRU_ISOLATING (1 << __I_LRU_ISOLATING)
24182423

24192424
#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
24202425
#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)

0 commit comments

Comments
 (0)