
Commit dcd7306

robn authored and behlendorf committed
zvol_remove_minors_impl: remove all async fallbacks
Since both the ZFS and OS sides of a zvol now take care of their own locking and don't get in each other's way, there's no need for the very complicated removal code to fall back to async tasks when the locks needed at each stage can't be obtained right away. Here we change it to a linear three-step process: select the zvols of interest and flag them for removal, then wait for them to shed activity and remove them, and finally, free them.

Sponsored-by: Klara, Inc.
Sponsored-by: Railway Corporation
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Fedor Uporov <[email protected]>
Signed-off-by: Rob Norris <[email protected]>
Closes #17625
1 parent 8a0e5e8 commit dcd7306
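
The message above describes the new shape of zvol_remove_minors_impl(): flag and collect under the state lock, wait for each flagged zvol to drain and unlink it, then free everything. The sketch below is a hypothetical, self-contained userspace model of that three-step pattern using POSIX threads; struct vol, remove_matching() and the other names are invented stand-ins, not the actual ZFS code or locking primitives.

/* Hypothetical userspace model of the three-step removal flow. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct vol {
	char name[64];
	int open_count;			/* stands in for zv_open_count */
	bool removing;			/* stands in for ZVOL_REMOVING */
	pthread_mutex_t lock;		/* stands in for zv_state_lock */
	pthread_cond_t removing_cv;	/* stands in for zv_removing_cv */
	struct vol *next;		/* global list linkage */
	struct vol *remove_next;	/* stands in for zv_remove_node */
};

static pthread_rwlock_t state_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct vol *all_vols;		/* stands in for zvol_state_list */

static void
remove_matching(const char *prefix)
{
	struct vol *remove_list = NULL, *v, *next;
	size_t plen = strlen(prefix);

	/* Step 1: under the read lock, flag matching volumes and collect them. */
	pthread_rwlock_rdlock(&state_lock);
	for (v = all_vols; v != NULL; v = v->next) {
		pthread_mutex_lock(&v->lock);
		if (!v->removing && strncmp(v->name, prefix, plen) == 0) {
			v->removing = true;	/* new opens now fail */
			v->remove_next = remove_list;
			remove_list = v;
		}
		pthread_mutex_unlock(&v->lock);
	}
	pthread_rwlock_unlock(&state_lock);

	/* Step 2: wait for each flagged volume to shed activity, then unlink it. */
	for (v = remove_list; v != NULL; v = v->remove_next) {
		pthread_mutex_lock(&v->lock);
		while (v->open_count > 0)
			pthread_cond_wait(&v->removing_cv, &v->lock);
		pthread_mutex_unlock(&v->lock);

		/* Briefly take the write lock to unlink it from the global list. */
		pthread_rwlock_wrlock(&state_lock);
		for (struct vol **pp = &all_vols; *pp != NULL; pp = &(*pp)->next) {
			if (*pp == v) {
				*pp = v->next;
				break;
			}
		}
		pthread_rwlock_unlock(&state_lock);
	}

	/* Step 3: nothing can reach these volumes any more; free them. */
	for (v = remove_list; v != NULL; v = next) {
		next = v->remove_next;
		pthread_mutex_destroy(&v->lock);
		pthread_cond_destroy(&v->removing_cv);
		free(v);
	}
}

static struct vol *
make_vol(const char *name)
{
	struct vol *v = calloc(1, sizeof (*v));
	if (v == NULL)
		abort();
	snprintf(v->name, sizeof (v->name), "%s", name);
	pthread_mutex_init(&v->lock, NULL);
	pthread_cond_init(&v->removing_cv, NULL);
	v->next = all_vols;
	all_vols = v;
	return (v);
}

int
main(void)
{
	make_vol("pool/vol1");
	make_vol("pool/vol2");
	make_vol("other/vol3");
	remove_matching("pool/");
	printf("remaining: %s\n", all_vols ? all_vols->name : "(none)");
	return (0);
}

The property being modelled is the one the commit relies on: nothing is unlinked or freed until it has stopped being used, and the global lock is taken only for reading while scanning and only briefly for writing while unlinking.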

File tree

2 files changed: +84 -102 lines

include/sys/zvol_impl.h
module/zfs/zvol.c


include/sys/zvol_impl.h

Lines changed: 1 addition & 0 deletions

@@ -56,6 +56,7 @@ typedef struct zvol_state {
 	atomic_t		zv_suspend_ref;		/* refcount for suspend */
 	krwlock_t		zv_suspend_lock;	/* suspend lock */
 	kcondvar_t		zv_removing_cv;		/* ready to remove minor */
+	list_node_t		zv_remove_node;		/* node on removal list */
 	struct zvol_state_os	*zv_zso;		/* private platform state */
 	boolean_t		zv_threading;		/* volthreading property */
 } zvol_state_t;
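
The new zv_remove_node field gives zvol_state_t a second, dedicated list linkage, so a zvol can be queued on the temporary removal list in zvol.c while it still hangs off the global zvol_state_list (which links through the existing zv_next node, as the removed list_create() calls below show). For readers unfamiliar with the offsetof()-based idiom behind list_create()/list_insert_head()/list_remove_head(), here is a minimal userspace sketch of the same intrusive-list pattern; the my_list_* types and functions are invented for illustration and are not the real kernel list API.

/* Minimal sketch of an intrusive list keyed by an embedded node. */
#include <stddef.h>
#include <stdio.h>

typedef struct my_node {
	struct my_node *ln_next;
} my_node_t;

typedef struct my_list {
	my_node_t *l_head;
	size_t l_offset;	/* offset of the embedded node in the element */
} my_list_t;

static void
my_list_create(my_list_t *l, size_t offset)
{
	l->l_head = NULL;
	l->l_offset = offset;
}

static void
my_list_insert_head(my_list_t *l, void *elem)
{
	/* Find the node embedded in the element and link it in. */
	my_node_t *n = (my_node_t *)((char *)elem + l->l_offset);
	n->ln_next = l->l_head;
	l->l_head = n;
}

static void *
my_list_remove_head(my_list_t *l)
{
	my_node_t *n = l->l_head;
	if (n == NULL)
		return (NULL);
	l->l_head = n->ln_next;
	/* Walk back from the node to the start of the containing element. */
	return ((char *)n - l->l_offset);
}

/*
 * An element can carry several nodes and therefore sit on several lists at
 * once, which is why zvol_state_t grows a dedicated zv_remove_node alongside
 * the linkage already used by the global list.
 */
struct item {
	int		id;
	my_node_t	global_node;	/* stand-in for the zv_next linkage */
	my_node_t	remove_node;	/* stand-in for zv_remove_node */
};

int
main(void)
{
	struct item a = { .id = 1 }, b = { .id = 2 };
	my_list_t remove_list;

	my_list_create(&remove_list, offsetof(struct item, remove_node));
	my_list_insert_head(&remove_list, &a);
	my_list_insert_head(&remove_list, &b);

	struct item *it;
	while ((it = my_list_remove_head(&remove_list)) != NULL)
		printf("removing item %d\n", it->id);
	return (0);
}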

module/zfs/zvol.c

Lines changed: 83 additions & 102 deletions

@@ -1579,51 +1579,6 @@ zvol_create_minors_impl(zvol_task_t *task)
 	zvol_task_update_status(task, total, done, last_error);
 }
 
-/*
- * Remove minors for specified dataset including children and snapshots.
- */
-
-/*
- * Remove the minor for a given zvol. This will do it all:
- * - flag the zvol for removal, so new requests are rejected
- * - wait until outstanding requests are completed
- * - remove it from lists
- * - free it
- * It's also usable as a taskq task, and smells nice too.
- */
-static void
-zvol_remove_minor_task(void *arg)
-{
-	zvol_state_t *zv = (zvol_state_t *)arg;
-
-	ASSERT(!RW_LOCK_HELD(&zvol_state_lock));
-	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
-
-	mutex_enter(&zv->zv_state_lock);
-	while (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) {
-		zv->zv_flags |= ZVOL_REMOVING;
-		cv_wait(&zv->zv_removing_cv, &zv->zv_state_lock);
-	}
-	mutex_exit(&zv->zv_state_lock);
-
-	rw_enter(&zvol_state_lock, RW_WRITER);
-	mutex_enter(&zv->zv_state_lock);
-
-	zvol_os_remove_minor(zv);
-	zvol_remove(zv);
-
-	mutex_exit(&zv->zv_state_lock);
-	rw_exit(&zvol_state_lock);
-
-	zvol_os_free(zv);
-}
-
-static void
-zvol_free_task(void *arg)
-{
-	zvol_os_free(arg);
-}
-
 /*
  * Remove minors for specified dataset and, optionally, its children and
  * snapshots.

@@ -1636,25 +1591,33 @@ zvol_remove_minors_impl(zvol_task_t *task)
 	int namelen = ((name) ? strlen(name) : 0);
 	boolean_t children = task ? !!task->zt_value : B_TRUE;
 
-	taskqid_t t;
-	list_t delay_list, free_list;
-
 	if (zvol_inhibit_dev)
 		return;
 
-	list_create(&delay_list, sizeof (zvol_state_t),
-	    offsetof(zvol_state_t, zv_next));
-	list_create(&free_list, sizeof (zvol_state_t),
-	    offsetof(zvol_state_t, zv_next));
-
-	int error = ENOENT;
+	/*
+	 * We collect up zvols that we want to remove on a separate list, so
+	 * that we don't have to hold zvol_state_lock for the whole time.
+	 *
+	 * We can't remove them from the global lists until we're completely
+	 * done with them, because that would make them appear to ZFS-side ops
+	 * that they don't exist, and the name might be reused, which can't be
+	 * good.
+	 */
+	list_t remove_list;
+	list_create(&remove_list, sizeof (zvol_state_t),
+	    offsetof(zvol_state_t, zv_remove_node));
 
-	rw_enter(&zvol_state_lock, RW_WRITER);
+	rw_enter(&zvol_state_lock, RW_READER);
 
 	for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
 		zv_next = list_next(&zvol_state_list, zv);
 
 		mutex_enter(&zv->zv_state_lock);
+		if (zv->zv_flags & ZVOL_REMOVING) {
+			/* Another thread is handling shutdown, skip it. */
+			mutex_exit(&zv->zv_state_lock);
+			continue;
+		}
 
 		/*
 		 * This zvol should be removed if:

@@ -1669,61 +1632,87 @@ zvol_remove_minors_impl(zvol_task_t *task)
 		    (children && strncmp(zv->zv_name, name, namelen) == 0 &&
 		    (zv->zv_name[namelen] == '/' ||
 		    zv->zv_name[namelen] == '@'))) {
-			/*
-			 * By holding zv_state_lock here, we guarantee that no
-			 * one is currently using this zv
-			 */
-			error = 0;
 
 			/*
-			 * If in use, try to throw everyone off and try again
-			 * later.
+			 * Matched, so mark it removal. We want to take the
+			 * write half of the suspend lock to make sure that
+			 * the zvol is not suspended, and give any data ops
+			 * chance to finish.
 			 */
-			zv->zv_flags |= ZVOL_REMOVING;
-			if (zv->zv_open_count > 0 ||
-			    atomic_read(&zv->zv_suspend_ref)) {
-				t = taskq_dispatch(
-				    zv->zv_objset->os_spa->spa_zvol_taskq,
-				    zvol_remove_minor_task, zv, TQ_SLEEP);
-				if (t == TASKQID_INVALID) {
-					/*
-					 * Couldn't create the task, so we'll
-					 * do it in place once the loop is
-					 * finished.
-					 */
-					list_insert_head(&delay_list, zv);
-				}
+			mutex_exit(&zv->zv_state_lock);
+			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
+			mutex_enter(&zv->zv_state_lock);
+
+			if (zv->zv_flags & ZVOL_REMOVING) {
+				/* Another thread has taken it, let them. */
 				mutex_exit(&zv->zv_state_lock);
+				rw_exit(&zv->zv_suspend_lock);
 				continue;
 			}
 
-			zvol_os_remove_minor(zv);
-			zvol_remove(zv);
-
-			/* Drop zv_state_lock before zvol_free() */
+			/*
+			 * Mark it and unlock. New entries will see the flag
+			 * and return ENXIO.
+			 */
+			zv->zv_flags |= ZVOL_REMOVING;
 			mutex_exit(&zv->zv_state_lock);
+			rw_exit(&zv->zv_suspend_lock);
 
-			/* Try parallel zv_free, if failed do it in place */
-			t = taskq_dispatch(system_taskq, zvol_free_task, zv,
-			    TQ_SLEEP);
-			if (t == TASKQID_INVALID)
-				list_insert_head(&free_list, zv);
-		} else {
+			/* Put it on the list for the next stage. */
+			list_insert_head(&remove_list, zv);
+		} else
 			mutex_exit(&zv->zv_state_lock);
-		}
 	}
+
 	rw_exit(&zvol_state_lock);
 
-	/* Wait for zvols that we couldn't create a remove task for */
-	while ((zv = list_remove_head(&delay_list)) != NULL)
-		zvol_remove_minor_task(zv);
+	/* Didn't match any, nothing to do! */
+	if (list_is_empty(&remove_list)) {
+		if (task)
+			task->zt_error = SET_ERROR(ENOENT);
+		return;
+	}
+
+	/* Actually shut them all down. */
+	for (zv = list_head(&remove_list); zv != NULL; zv = zv_next) {
+		zv_next = list_next(&remove_list, zv);
 
-	/* Free any that we couldn't free in parallel earlier */
-	while ((zv = list_remove_head(&free_list)) != NULL)
+		mutex_enter(&zv->zv_state_lock);
+
+		/*
+		 * Still open or suspended, just wait. This can happen if, for
+		 * example, we managed to acquire zv_state_lock in the moments
+		 * where zvol_open() or zvol_release() are trading locks to
+		 * call zvol_first_open() or zvol_last_close().
+		 */
+		while (zv->zv_open_count > 0 ||
+		    atomic_read(&zv->zv_suspend_ref))
+			cv_wait(&zv->zv_removing_cv, &zv->zv_state_lock);
+
+		/*
+		 * No users, shut down the OS side. This may not remove the
+		 * minor from view immediately, depending on the kernel
+		 * specifics, but it will ensure that it is unusable and that
+		 * this zvol_state_t can never again be reached from an OS-side
+		 * operation.
+		 */
+		zvol_os_remove_minor(zv);
+		mutex_exit(&zv->zv_state_lock);
+
+		/* Remove it from the name lookup lists */
+		rw_enter(&zvol_state_lock, RW_WRITER);
+		zvol_remove(zv);
+		rw_exit(&zvol_state_lock);
+	}
+
+	/*
+	 * Our own references on remove_list is the last one, free them and
+	 * we're done.
+	 */
+	while ((zv = list_remove_head(&remove_list)) != NULL)
 		zvol_os_free(zv);
 
-	if (error && task)
-		task->zt_error = SET_ERROR(error);
+	list_destroy(&remove_list);
 }
 
 /* Remove minor for this specific volume only */

@@ -2182,14 +2171,6 @@ zvol_fini_impl(void)
 
 	zvol_remove_minors_impl(NULL);
 
-	/*
-	 * The call to "zvol_remove_minors_impl" may dispatch entries to
-	 * the system_taskq, but it doesn't wait for those entries to
-	 * complete before it returns. Thus, we must wait for all of the
-	 * removals to finish, before we can continue.
-	 */
-	taskq_wait_outstanding(system_taskq, 0);
-
 	kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head));
 	list_destroy(&zvol_state_list);
 	rw_destroy(&zvol_state_lock);
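
A note on the wait in the second stage above: the remover parks on zv_removing_cv while zv_open_count or zv_suspend_ref is still nonzero, which only works because the open/release and suspend-drop paths are expected to broadcast that condvar when the last user goes away after ZVOL_REMOVING is set (that side is not touched by this diff). The toy program below models the handshake with invented names (user_thread, open_count, and so on) under that assumption.

/* Hypothetical model of the remover / last-close handshake. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;	/* like zv_state_lock */
static pthread_cond_t removing_cv = PTHREAD_COND_INITIALIZER;	/* like zv_removing_cv */
static int open_count = 1;		/* one user still holds the volume open */
static bool removing = false;		/* like ZVOL_REMOVING */

/* A user that is slow to close; models a zvol with in-flight activity. */
static void *
user_thread(void *arg)
{
	(void) arg;
	sleep(1);			/* pretend to do I/O for a while */

	pthread_mutex_lock(&state_lock);
	open_count--;			/* last close */
	if (removing && open_count == 0)
		pthread_cond_broadcast(&removing_cv);	/* wake the remover */
	pthread_mutex_unlock(&state_lock);
	return (NULL);
}

int
main(void)
{
	pthread_t t;
	pthread_create(&t, NULL, user_thread, NULL);

	/* Remover: set the flag first so no new opens succeed, then wait it out. */
	pthread_mutex_lock(&state_lock);
	removing = true;
	while (open_count > 0)
		pthread_cond_wait(&removing_cv, &state_lock);
	pthread_mutex_unlock(&state_lock);

	printf("drained; safe to remove the minor and free the state\n");
	pthread_join(t, NULL);
	return (0);
}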
