Skip to content

Commit 96f9d27

Browse files
robnbehlendorf
authored andcommitted
zvol: remove the OS-side minor before freeing the zvol
When destroying a zvol, it is not "unpublished" from the system (that is, /dev/zd* node removed) until zvol_os_free(). Under Linux, at the time del_gendisk() and put_disk() are called, the device node may still be have an active hold, from a userspace program or something inside the kernel (a partition probe). As it is currently, this can lead to calls to zvol_open() or zvol_release() while the zvol_state_t is partially or fully freed. zvol_open() has some protection against this by checking that private_data is NULL, but zvol_release does not. This implements a better ordering for all of this by adding a new OS-side method, zvol_os_remove_minor(), which is responsible for fully decoupling the "private" (OS-side) objects from the zvol_state_t. For Linux, that means calling put_disk(), nulling private_data, and freeing zv_zso. This takes the place of zvol_os_clear_private(), which was a nod in that direction but did not do enough, and did not do it early enough. Equivalent changes are made on the FreeBSD side to follow the API change. Sponsored-by: Klara, Inc. Sponsored-by: Railway Corporation Reviewed-by: Brian Behlendorf <[email protected]> Reviewed-by: Fedor Uporov <[email protected]> Signed-off-by: Rob Norris <[email protected]> Closes #17625
1 parent b2c7927 commit 96f9d27

File tree

4 files changed

+99
-96
lines changed

4 files changed

+99
-96
lines changed

include/sys/zvol_impl.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
* CDDL HEADER END
2121
*/
2222
/*
23-
* Copyright (c) 2024, Klara, Inc.
23+
* Copyright (c) 2024, 2025, Klara, Inc.
2424
*/
2525

2626
#ifndef _SYS_ZVOL_IMPL_H
@@ -135,7 +135,7 @@ int zvol_os_rename_minor(zvol_state_t *zv, const char *newname);
135135
int zvol_os_create_minor(const char *name);
136136
int zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize);
137137
boolean_t zvol_os_is_zvol(const char *path);
138-
void zvol_os_clear_private(zvol_state_t *zv);
138+
void zvol_os_remove_minor(zvol_state_t *zv);
139139
void zvol_os_set_disk_ro(zvol_state_t *zv, int flags);
140140
void zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity);
141141

module/os/freebsd/zfs/zvol_os.c

Lines changed: 39 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
3232
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
3333
* Copyright (c) 2014 Integros [integros.com]
34-
* Copyright (c) 2024, Klara, Inc.
34+
* Copyright (c) 2024, 2025, Klara, Inc.
3535
*/
3636

3737
/* Portions Copyright 2011 Martin Matuska <[email protected]> */
@@ -196,7 +196,6 @@ DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
196196

197197
static int zvol_geom_open(struct g_provider *pp, int flag, int count);
198198
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
199-
static void zvol_geom_destroy(zvol_state_t *zv);
200199
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
201200
static void zvol_geom_bio_start(struct bio *bp);
202201
static int zvol_geom_bio_getattr(struct bio *bp);
@@ -408,20 +407,6 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
408407
return (0);
409408
}
410409

411-
static void
412-
zvol_geom_destroy(zvol_state_t *zv)
413-
{
414-
struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
415-
struct g_provider *pp = zsg->zsg_provider;
416-
417-
ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
418-
419-
g_topology_assert();
420-
421-
zsg->zsg_provider = NULL;
422-
g_wither_geom(pp->geom, ENXIO);
423-
}
424-
425410
void
426411
zvol_wait_close(zvol_state_t *zv)
427412
{
@@ -1400,42 +1385,65 @@ zvol_alloc(const char *name, uint64_t volsize, uint64_t volblocksize,
14001385
* Remove minor node for the specified volume.
14011386
*/
14021387
void
1403-
zvol_os_free(zvol_state_t *zv)
1388+
zvol_os_remove_minor(zvol_state_t *zv)
14041389
{
1405-
ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
1406-
ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
1390+
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
14071391
ASSERT0(zv->zv_open_count);
1392+
ASSERT0(atomic_read(&zv->zv_suspend_ref));
1393+
ASSERT(zv->zv_flags & ZVOL_REMOVING);
14081394

1409-
ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
1410-
1411-
rw_destroy(&zv->zv_suspend_lock);
1412-
zfs_rangelock_fini(&zv->zv_rangelock);
1395+
struct zvol_state_os *zso = zv->zv_zso;
1396+
zv->zv_zso = NULL;
14131397

14141398
if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1415-
struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1416-
struct g_provider *pp __maybe_unused = zsg->zsg_provider;
1417-
1418-
ASSERT0P(pp->private);
1399+
struct zvol_state_geom *zsg = &zso->zso_geom;
1400+
struct g_provider *pp = zsg->zsg_provider;
1401+
pp->private = NULL;
1402+
mutex_exit(&zv->zv_state_lock);
14191403

14201404
g_topology_lock();
1421-
zvol_geom_destroy(zv);
1405+
g_wither_geom(pp->geom, ENXIO);
14221406
g_topology_unlock();
14231407
} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1424-
struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1408+
struct zvol_state_dev *zsd = &zso->zso_dev;
14251409
struct cdev *dev = zsd->zsd_cdev;
14261410

1411+
if (dev != NULL)
1412+
dev->si_drv2 = NULL;
1413+
mutex_exit(&zv->zv_state_lock);
1414+
14271415
if (dev != NULL) {
1428-
ASSERT0P(dev->si_drv2);
14291416
destroy_dev(dev);
14301417
knlist_clear(&zsd->zsd_selinfo.si_note, 0);
14311418
knlist_destroy(&zsd->zsd_selinfo.si_note);
14321419
}
14331420
}
14341421

1422+
kmem_free(zso, sizeof (struct zvol_state_os));
1423+
1424+
mutex_enter(&zv->zv_state_lock);
1425+
}
1426+
1427+
void
1428+
zvol_os_free(zvol_state_t *zv)
1429+
{
1430+
ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
1431+
ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
1432+
ASSERT0(zv->zv_open_count);
1433+
ASSERT0P(zv->zv_zso);
1434+
1435+
ASSERT0P(zv->zv_objset);
1436+
ASSERT0P(zv->zv_zilog);
1437+
ASSERT0P(zv->zv_dn);
1438+
1439+
ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
1440+
1441+
rw_destroy(&zv->zv_suspend_lock);
1442+
zfs_rangelock_fini(&zv->zv_rangelock);
1443+
14351444
mutex_destroy(&zv->zv_state_lock);
14361445
cv_destroy(&zv->zv_removing_cv);
14371446
dataset_kstats_destroy(&zv->zv_kstat);
1438-
kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
14391447
kmem_free(zv, sizeof (zvol_state_t));
14401448
zvol_minors--;
14411449
}
@@ -1538,28 +1546,6 @@ zvol_os_create_minor(const char *name)
15381546
return (error);
15391547
}
15401548

1541-
void
1542-
zvol_os_clear_private(zvol_state_t *zv)
1543-
{
1544-
ASSERT(RW_LOCK_HELD(&zvol_state_lock));
1545-
if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1546-
struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1547-
struct g_provider *pp = zsg->zsg_provider;
1548-
1549-
if (pp->private == NULL) /* already cleared */
1550-
return;
1551-
1552-
pp->private = NULL;
1553-
ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
1554-
} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1555-
struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1556-
struct cdev *dev = zsd->zsd_cdev;
1557-
1558-
if (dev != NULL)
1559-
dev->si_drv2 = NULL;
1560-
}
1561-
}
1562-
15631549
int
15641550
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
15651551
{

module/os/linux/zfs/zvol_os.c

Lines changed: 54 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
/*
2323
* Copyright (c) 2012, 2020 by Delphix. All rights reserved.
2424
* Copyright (c) 2024, Rob Norris <[email protected]>
25-
* Copyright (c) 2024, Klara, Inc.
25+
* Copyright (c) 2024, 2025, Klara, Inc.
2626
*/
2727

2828
#include <sys/dataset_kstats.h>
@@ -971,16 +971,6 @@ zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
971971
return (0);
972972
}
973973

974-
void
975-
zvol_os_clear_private(zvol_state_t *zv)
976-
{
977-
/*
978-
* Cleared while holding zvol_state_lock as a writer
979-
* which will prevent zvol_open() from opening it.
980-
*/
981-
zv->zv_zso->zvo_disk->private_data = NULL;
982-
}
983-
984974
/*
985975
* Provide a simple virtual geometry for legacy compatibility. For devices
986976
* smaller than 1 MiB a small head and sector count is used to allow very
@@ -1417,6 +1407,54 @@ zvol_alloc(dev_t dev, const char *name, uint64_t volsize, uint64_t volblocksize,
14171407
return (ret);
14181408
}
14191409

1410+
void
1411+
zvol_os_remove_minor(zvol_state_t *zv)
1412+
{
1413+
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1414+
ASSERT0(zv->zv_open_count);
1415+
ASSERT0(atomic_read(&zv->zv_suspend_ref));
1416+
ASSERT(zv->zv_flags & ZVOL_REMOVING);
1417+
1418+
/*
1419+
* Cleared while holding zvol_state_lock as a writer
1420+
* which will prevent zvol_open() from opening it.
1421+
*/
1422+
struct zvol_state_os *zso = zv->zv_zso;
1423+
zv->zv_zso = NULL;
1424+
1425+
/* Clearing private_data will make new callers return immediately. */
1426+
zso->zvo_disk->private_data = NULL;
1427+
1428+
/*
1429+
* Drop the state lock before calling del_gendisk(). There may be
1430+
* callers waiting to acquire it, but del_gendisk() will block until
1431+
* they exit, which would deadlock.
1432+
*/
1433+
mutex_exit(&zv->zv_state_lock);
1434+
1435+
del_gendisk(zso->zvo_disk);
1436+
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
1437+
(defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
1438+
#if defined(HAVE_BLK_CLEANUP_DISK)
1439+
blk_cleanup_disk(zso->zvo_disk);
1440+
#else
1441+
put_disk(zso->zvo_disk);
1442+
#endif
1443+
#else
1444+
blk_cleanup_queue(zso->zvo_queue);
1445+
put_disk(zso->zvo_disk);
1446+
#endif
1447+
1448+
if (zso->use_blk_mq)
1449+
blk_mq_free_tag_set(&zso->tag_set);
1450+
1451+
ida_simple_remove(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS);
1452+
1453+
kmem_free(zso, sizeof (struct zvol_state_os));
1454+
1455+
mutex_enter(&zv->zv_state_lock);
1456+
}
1457+
14201458
/*
14211459
* Cleanup then free a zvol_state_t which was created by zvol_alloc().
14221460
* At this time, the structure is not opened by anyone, is taken off
@@ -1435,35 +1473,19 @@ zvol_os_free(zvol_state_t *zv)
14351473
ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
14361474
ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
14371475
ASSERT0(zv->zv_open_count);
1438-
ASSERT0P(zv->zv_zso->zvo_disk->private_data);
1476+
ASSERT0P(zv->zv_zso);
1477+
1478+
ASSERT0P(zv->zv_objset);
1479+
ASSERT0P(zv->zv_zilog);
1480+
ASSERT0P(zv->zv_dn);
14391481

14401482
rw_destroy(&zv->zv_suspend_lock);
14411483
zfs_rangelock_fini(&zv->zv_rangelock);
14421484

1443-
del_gendisk(zv->zv_zso->zvo_disk);
1444-
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
1445-
(defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
1446-
#if defined(HAVE_BLK_CLEANUP_DISK)
1447-
blk_cleanup_disk(zv->zv_zso->zvo_disk);
1448-
#else
1449-
put_disk(zv->zv_zso->zvo_disk);
1450-
#endif
1451-
#else
1452-
blk_cleanup_queue(zv->zv_zso->zvo_queue);
1453-
put_disk(zv->zv_zso->zvo_disk);
1454-
#endif
1455-
1456-
if (zv->zv_zso->use_blk_mq)
1457-
blk_mq_free_tag_set(&zv->zv_zso->tag_set);
1458-
1459-
ida_simple_remove(&zvol_ida,
1460-
MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
1461-
14621485
cv_destroy(&zv->zv_removing_cv);
14631486
mutex_destroy(&zv->zv_state_lock);
14641487
dataset_kstats_destroy(&zv->zv_kstat);
14651488

1466-
kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
14671489
kmem_free(zv, sizeof (zvol_state_t));
14681490
}
14691491

module/zfs/zvol.c

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
3939
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
4040
* Copyright (c) 2012, 2019 by Delphix. All rights reserved.
41-
* Copyright (c) 2024, Klara, Inc.
41+
* Copyright (c) 2024, 2025, Klara, Inc.
4242
*/
4343

4444
/*
@@ -1599,8 +1599,8 @@ zvol_remove_minor_task(void *arg)
15991599
rw_enter(&zvol_state_lock, RW_WRITER);
16001600
mutex_enter(&zv->zv_state_lock);
16011601

1602+
zvol_os_remove_minor(zv);
16021603
zvol_remove(zv);
1603-
zvol_os_clear_private(zv);
16041604

16051605
mutex_exit(&zv->zv_state_lock);
16061606
rw_exit(&zvol_state_lock);
@@ -1669,9 +1669,9 @@ zvol_remove_minors_impl(zvol_task_t *task)
16691669
* If in use, try to throw everyone off and try again
16701670
* later.
16711671
*/
1672+
zv->zv_flags |= ZVOL_REMOVING;
16721673
if (zv->zv_open_count > 0 ||
16731674
atomic_read(&zv->zv_suspend_ref)) {
1674-
zv->zv_flags |= ZVOL_REMOVING;
16751675
t = taskq_dispatch(
16761676
zv->zv_objset->os_spa->spa_zvol_taskq,
16771677
zvol_remove_minor_task, zv, TQ_SLEEP);
@@ -1687,14 +1687,9 @@ zvol_remove_minors_impl(zvol_task_t *task)
16871687
continue;
16881688
}
16891689

1690+
zvol_os_remove_minor(zv);
16901691
zvol_remove(zv);
16911692

1692-
/*
1693-
* Cleared while holding zvol_state_lock as a writer
1694-
* which will prevent zvol_open() from opening it.
1695-
*/
1696-
zvol_os_clear_private(zv);
1697-
16981693
/* Drop zv_state_lock before zvol_free() */
16991694
mutex_exit(&zv->zv_state_lock);
17001695

0 commit comments

Comments
 (0)