Skip to content

Commit 288b2d4

Browse files
committed
Fix inode eviction and sync writeback deadlock on Linux
If inode eviction and sync writeback happen on the same inode at the same time, inode eviction will set I_FREEING and wait for sync writeback, and sync writeback may eventually calls zfs_get_data and loop in zfs_zget forever because igrab cannot succeed with I_FREEING, thus causing deadlock. To fix this, we take a hold on znode in itx. This make sure the inode won't get evicted until sync writeback is done. Signed-off-by: Chunwei Chen <david.chen@nutanix.com> Fixes openzfs#7964 Fixes openzfs#9430
1 parent 1178796 commit 288b2d4

File tree

9 files changed

+71
-80
lines changed

9 files changed

+71
-80
lines changed

cmd/ztest.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2487,7 +2487,7 @@ ztest_get_done(zgd_t *zgd, int error)
24872487
}
24882488

24892489
static int
2490-
ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
2490+
ztest_get_data(void *arg, void *arg2, lr_write_t *lr, char *buf,
24912491
struct lwb *lwb, zio_t *zio)
24922492
{
24932493
(void) arg2;

include/sys/zil.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,7 @@ typedef enum {
457457
} itx_wr_state_t;
458458

459459
typedef void (*zil_callback_t)(void *data, int err);
460+
typedef void (*itx_zrele_t)(void *);
460461

461462
typedef struct itx {
462463
list_node_t itx_node; /* linkage on zl_itx_list */
@@ -467,7 +468,8 @@ typedef struct itx {
467468
void *itx_callback_data; /* User data for the callback */
468469
size_t itx_size; /* allocated itx structure size */
469470
uint64_t itx_oid; /* object id */
470-
uint64_t itx_gen; /* gen number for zfs_get_data */
471+
void *itx_znode; /* znode for zfs_get_data */
472+
itx_zrele_t itx_zrele; /* cleanup for itx_znode */
471473
lr_t itx_lr; /* common part of log record */
472474
uint8_t itx_lr_data[]; /* type-specific part of lr_xx_t */
473475
} itx_t;
@@ -605,7 +607,7 @@ typedef int zil_parse_blk_func_t(zilog_t *zilog, const blkptr_t *bp, void *arg,
605607
typedef int zil_parse_lr_func_t(zilog_t *zilog, const lr_t *lr, void *arg,
606608
uint64_t txg);
607609
typedef int zil_replay_func_t(void *arg1, void *arg2, boolean_t byteswap);
608-
typedef int zil_get_data_t(void *arg, uint64_t arg2, lr_write_t *lr, char *dbuf,
610+
typedef int zil_get_data_t(void *arg, void *arg2, lr_write_t *lr, char *dbuf,
609611
struct lwb *lwb, zio_t *zio);
610612

611613
extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,

include/sys/zvol_impl.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off,
115115
uint64_t len);
116116
void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
117117
uint64_t size, boolean_t commit);
118-
int zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
118+
int zvol_get_data(void *arg, void *arg2, lr_write_t *lr, char *buf,
119119
struct lwb *lwb, zio_t *zio);
120120
int zvol_init_impl(void);
121121
void zvol_fini_impl(void);

module/os/freebsd/zfs/zfs_vfsops.c

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1598,6 +1598,29 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
15981598
znode_t *zp;
15991599
dsl_dir_t *dd;
16001600

1601+
ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
1602+
1603+
if (!unmounting) {
1604+
/*
1605+
* We purge the parent filesystem's vfsp as the parent
1606+
* filesystem and all of its snapshots have their vnode's
1607+
* v_vfsp set to the parent's filesystem's vfsp. Note,
1608+
* 'z_parent' is self referential for non-snapshots.
1609+
*/
1610+
#ifdef FREEBSD_NAMECACHE
1611+
cache_purgevfs(zfsvfs->z_parent->z_vfs);
1612+
#endif
1613+
}
1614+
1615+
/*
1616+
* Close the zil. NB: Can't close the zil while zfs_inactive
1617+
* threads are blocked as zil_close can call zfs_inactive.
1618+
*/
1619+
if (zfsvfs->z_log) {
1620+
zil_close(zfsvfs->z_log);
1621+
zfsvfs->z_log = NULL;
1622+
}
1623+
16011624
/*
16021625
* If someone has not already unmounted this file system,
16031626
* drain the zrele_taskq to ensure all active references to the
@@ -1624,28 +1647,6 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
16241647
break;
16251648
}
16261649
}
1627-
ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
1628-
1629-
if (!unmounting) {
1630-
/*
1631-
* We purge the parent filesystem's vfsp as the parent
1632-
* filesystem and all of its snapshots have their vnode's
1633-
* v_vfsp set to the parent's filesystem's vfsp. Note,
1634-
* 'z_parent' is self referential for non-snapshots.
1635-
*/
1636-
#ifdef FREEBSD_NAMECACHE
1637-
cache_purgevfs(zfsvfs->z_parent->z_vfs);
1638-
#endif
1639-
}
1640-
1641-
/*
1642-
* Close the zil. NB: Can't close the zil while zfs_inactive
1643-
* threads are blocked as zil_close can call zfs_inactive.
1644-
*/
1645-
if (zfsvfs->z_log) {
1646-
zil_close(zfsvfs->z_log);
1647-
zfsvfs->z_log = NULL;
1648-
}
16491650

16501651
ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs);
16511652

module/os/linux/zfs/zfs_vfsops.c

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1349,6 +1349,28 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
13491349

13501350
zfs_unlinked_drain_stop_wait(zfsvfs);
13511351

1352+
ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
1353+
1354+
if (!unmounting) {
1355+
/*
1356+
* We purge the parent filesystem's super block as the
1357+
* parent filesystem and all of its snapshots have their
1358+
* inode's super block set to the parent's filesystem's
1359+
* super block. Note, 'z_parent' is self referential
1360+
* for non-snapshots.
1361+
*/
1362+
shrink_dcache_sb(zfsvfs->z_parent->z_sb);
1363+
}
1364+
1365+
/*
1366+
* Close the zil. NB: Can't close the zil while zfs_inactive
1367+
* threads are blocked as zil_close can call zfs_inactive.
1368+
*/
1369+
if (zfsvfs->z_log) {
1370+
zil_close(zfsvfs->z_log);
1371+
zfsvfs->z_log = NULL;
1372+
}
1373+
13521374
/*
13531375
* If someone has not already unmounted this file system,
13541376
* drain the zrele_taskq to ensure all active references to the
@@ -1376,28 +1398,6 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
13761398
}
13771399
}
13781400

1379-
ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
1380-
1381-
if (!unmounting) {
1382-
/*
1383-
* We purge the parent filesystem's super block as the
1384-
* parent filesystem and all of its snapshots have their
1385-
* inode's super block set to the parent's filesystem's
1386-
* super block. Note, 'z_parent' is self referential
1387-
* for non-snapshots.
1388-
*/
1389-
shrink_dcache_sb(zfsvfs->z_parent->z_sb);
1390-
}
1391-
1392-
/*
1393-
* Close the zil. NB: Can't close the zil while zfs_inactive
1394-
* threads are blocked as zil_close can call zfs_inactive.
1395-
*/
1396-
if (zfsvfs->z_log) {
1397-
zil_close(zfsvfs->z_log);
1398-
zfsvfs->z_log = NULL;
1399-
}
1400-
14011401
rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
14021402

14031403
/*

module/zfs/zfs_log.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
#include <sys/spa.h>
4848
#include <sys/zfs_fuid.h>
4949
#include <sys/dsl_dataset.h>
50+
#include <sys/zfs_vnops.h>
5051

5152
/*
5253
* These zfs_log_* functions must be called within a dmu tx, in one
@@ -615,7 +616,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
615616
dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
616617
uint32_t blocksize = zp->z_blksz;
617618
itx_wr_state_t write_state;
618-
uint64_t gen = 0, log_size = 0;
619+
uint64_t log_size = 0;
619620

620621
if (zil_replaying(zilog, tx) || zp->z_unlinked ||
621622
zfs_xattr_owner_unlinked(zp)) {
@@ -627,9 +628,6 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
627628
write_state = zil_write_state(zilog, resid, blocksize, o_direct,
628629
commit);
629630

630-
(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &gen,
631-
sizeof (gen));
632-
633631
while (resid) {
634632
itx_t *itx;
635633
lr_write_t *lr;
@@ -683,7 +681,9 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
683681

684682
itx->itx_private = ZTOZSB(zp);
685683
itx->itx_sync = (zp->z_sync_cnt != 0);
686-
itx->itx_gen = gen;
684+
zhold(zp);
685+
itx->itx_znode = zp;
686+
itx->itx_zrele = (itx_zrele_t)zfs_zrele_async;
687687

688688
if (resid == len) {
689689
itx->itx_callback = callback;

module/zfs/zfs_vnops.c

Lines changed: 7 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1326,45 +1326,27 @@ static void zfs_get_done(zgd_t *zgd, int error);
13261326
* Get data to generate a TX_WRITE intent log record.
13271327
*/
13281328
int
1329-
zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
1329+
zfs_get_data(void *arg, void *arg2, lr_write_t *lr, char *buf,
13301330
struct lwb *lwb, zio_t *zio)
13311331
{
13321332
zfsvfs_t *zfsvfs = arg;
13331333
objset_t *os = zfsvfs->z_os;
1334-
znode_t *zp;
1334+
znode_t *zp = arg2;
13351335
uint64_t object = lr->lr_foid;
13361336
uint64_t offset = lr->lr_offset;
13371337
uint64_t size = lr->lr_length;
13381338
zgd_t *zgd;
13391339
int error = 0;
1340-
uint64_t zp_gen;
13411340

1341+
ASSERT3P(zp, !=, NULL);
13421342
ASSERT3P(lwb, !=, NULL);
13431343
ASSERT3U(size, !=, 0);
13441344

1345-
/*
1346-
* Nothing to do if the file has been removed
1347-
*/
1348-
if (zfs_zget(zfsvfs, object, &zp) != 0)
1349-
return (SET_ERROR(ENOENT));
1350-
if (zp->z_unlinked) {
1351-
/*
1352-
* Release the vnode asynchronously as we currently have the
1353-
* txg stopped from syncing.
1354-
*/
1355-
zfs_zrele_async(zp);
1345+
if (zp->z_unlinked)
13561346
return (SET_ERROR(ENOENT));
1357-
}
1358-
/* check if generation number matches */
1359-
if (sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
1360-
sizeof (zp_gen)) != 0) {
1361-
zfs_zrele_async(zp);
1362-
return (SET_ERROR(EIO));
1363-
}
1364-
if (zp_gen != gen) {
1365-
zfs_zrele_async(zp);
1366-
return (SET_ERROR(ENOENT));
1367-
}
1347+
1348+
/* Hold zp ref for ourselves */
1349+
zhold(zp);
13681350

13691351
zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
13701352
zgd->zgd_lwb = lwb;

module/zfs/zil.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2457,7 +2457,7 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx)
24572457
* writing completion before the vdev cache flushing.
24582458
*/
24592459
error = zilog->zl_get_data(itx->itx_private,
2460-
itx->itx_gen, lrwb, dbuf, lwb,
2460+
itx->itx_znode, lrwb, dbuf, lwb,
24612461
lwb->lwb_child_zio);
24622462
if (dbuf != NULL && error == 0) {
24632463
/* Zero any padding bytes in the last block. */
@@ -2565,6 +2565,7 @@ zil_itx_clone(itx_t *oitx)
25652565
memcpy(itx, oitx, oitx->itx_size);
25662566
itx->itx_callback = NULL;
25672567
itx->itx_callback_data = NULL;
2568+
itx->itx_zrele = NULL;
25682569
return (itx);
25692570
}
25702571

@@ -2580,6 +2581,11 @@ zil_itx_destroy(itx_t *itx, int err)
25802581
if (itx->itx_callback != NULL)
25812582
itx->itx_callback(itx->itx_callback_data, err);
25822583

2584+
if (itx->itx_zrele) {
2585+
ASSERT3P(itx->itx_znode, !=, NULL);
2586+
itx->itx_zrele(itx->itx_znode);
2587+
}
2588+
25832589
zio_data_buf_free(itx, itx->itx_size);
25842590
}
25852591

module/zfs/zvol.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -972,7 +972,7 @@ zvol_get_done(zgd_t *zgd, int error)
972972
* Get data to generate a TX_WRITE intent log record.
973973
*/
974974
int
975-
zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
975+
zvol_get_data(void *arg, void *arg2, lr_write_t *lr, char *buf,
976976
struct lwb *lwb, zio_t *zio)
977977
{
978978
zvol_state_t *zv = arg;

0 commit comments

Comments
 (0)