Skip to content

Commit 8a0e5e8

Browse files
robnbehlendorf
authored and committed
zvol: stop using zvol_state_lock to protect OS-side private data
zvol_state_lock is intended to protect access to the global name->zvol lists (zvol_find_by_name()), but has also been used to control access to OS-side private data, accessed through whatever kernel object is used to represent the volume (gendisk, geom, etc). This appears to have been necessary to some degree because the OS-side object is what's used to get a handle on zvol_state_t, so zv_state_lock and zv_suspend_lock can't be used to manage access, but also, with the private object and the zvol_state_t being shutdown and destroyed at the same time in zvol_os_free(), we must ensure that the private object pointer only ever corresponds to a real zvol_state_t, not one in partial destruction. Taking the global lock seems like a convenient way to ensure this. The problem with this is that zvol_state_lock does not actually protect access to the zvol_state_t internals, so we need to take zv_state_lock and/or zv_suspend_lock. If those are contended, this can then cause OS-side operations (eg zvol_open()) to sleep to wait for them while holding zvol_state_lock. This then blocks out all other OS-side operations which want to get the private data, and any ZFS-side control operations that would take the write half of the lock. It's even worse if ZFS-side operations induce OS-side calls back into the zvol (eg creating a zvol triggers a partition probe inside the kernel, and also a userspace access from udev to set up device links). And it gets even worse again if anything decides to defer those ops to a task and wait on them, which zvol_remove_minors_impl() will do under high load. However, since the previous commit, we have a guarantee that the private data pointer will always be NULL'd out in zvol_os_remove_minor() _before_ the zvol_state_t is made invalid, but it won't happen until all users are ejected. 
So, if we make access to the private object pointer atomic, we remove the need to take a global lock to access it, and so we can remove all acquisitions of zvol_state_lock from the OS side. While here, I've rewritten much of the locking theory comment at the top of zvol.c. It wasn't wrong, but it hadn't been followed exactly, so I've tried to describe the purpose of each lock in a little more detail, and in particular describe where it should and shouldn't be used. Sponsored-by: Klara, Inc. Sponsored-by: Railway Corporation Reviewed-by: Brian Behlendorf <[email protected]> Reviewed-by: Fedor Uporov <[email protected]> Signed-off-by: Rob Norris <[email protected]> Closes #17625
1 parent 96f9d27 commit 8a0e5e8

File tree

3 files changed

+134
-113
lines changed

3 files changed

+134
-113
lines changed

module/os/freebsd/zfs/zvol_os.c

Lines changed: 67 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -225,25 +225,14 @@ zvol_geom_open(struct g_provider *pp, int flag, int count)
225225
}
226226

227227
retry:
228-
rw_enter(&zvol_state_lock, ZVOL_RW_READER);
229-
/*
230-
* Obtain a copy of private under zvol_state_lock to make sure either
231-
* the result of zvol free code setting private to NULL is observed,
232-
* or the zv is protected from being freed because of the positive
233-
* zv_open_count.
234-
*/
235-
zv = pp->private;
236-
if (zv == NULL) {
237-
rw_exit(&zvol_state_lock);
238-
err = SET_ERROR(ENXIO);
239-
goto out_locked;
240-
}
228+
zv = atomic_load_ptr(&pp->private);
229+
if (zv == NULL)
230+
return (SET_ERROR(ENXIO));
241231

242232
mutex_enter(&zv->zv_state_lock);
243233
if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
244-
rw_exit(&zvol_state_lock);
245234
err = SET_ERROR(ENXIO);
246-
goto out_zv_locked;
235+
goto out_locked;
247236
}
248237
ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
249238

@@ -256,16 +245,31 @@ zvol_geom_open(struct g_provider *pp, int flag, int count)
256245
drop_suspend = B_TRUE;
257246
if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
258247
mutex_exit(&zv->zv_state_lock);
248+
249+
/*
250+
* Removal may happen while the locks are down, so
251+
* we can't trust zv any longer; we have to start over.
252+
*/
253+
zv = atomic_load_ptr(&pp->private);
254+
if (zv == NULL)
255+
return (SET_ERROR(ENXIO));
256+
259257
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
260258
mutex_enter(&zv->zv_state_lock);
259+
260+
if (zv->zv_zso->zso_dying ||
261+
zv->zv_flags & ZVOL_REMOVING) {
262+
err = SET_ERROR(ENXIO);
263+
goto out_locked;
264+
}
265+
261266
/* Check to see if zv_suspend_lock is needed. */
262267
if (zv->zv_open_count != 0) {
263268
rw_exit(&zv->zv_suspend_lock);
264269
drop_suspend = B_FALSE;
265270
}
266271
}
267272
}
268-
rw_exit(&zvol_state_lock);
269273

270274
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
271275

@@ -293,7 +297,7 @@ zvol_geom_open(struct g_provider *pp, int flag, int count)
293297
if (drop_namespace)
294298
mutex_exit(&spa_namespace_lock);
295299
if (err)
296-
goto out_zv_locked;
300+
goto out_locked;
297301
pp->mediasize = zv->zv_volsize;
298302
pp->stripeoffset = 0;
299303
pp->stripesize = zv->zv_volblocksize;
@@ -328,9 +332,8 @@ zvol_geom_open(struct g_provider *pp, int flag, int count)
328332
zvol_last_close(zv);
329333
wakeup(zv);
330334
}
331-
out_zv_locked:
332-
mutex_exit(&zv->zv_state_lock);
333335
out_locked:
336+
mutex_exit(&zv->zv_state_lock);
334337
if (drop_suspend)
335338
rw_exit(&zv->zv_suspend_lock);
336339
return (err);
@@ -344,12 +347,9 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
344347
boolean_t drop_suspend = B_TRUE;
345348
int new_open_count;
346349

347-
rw_enter(&zvol_state_lock, ZVOL_RW_READER);
348-
zv = pp->private;
349-
if (zv == NULL) {
350-
rw_exit(&zvol_state_lock);
350+
zv = atomic_load_ptr(&pp->private);
351+
if (zv == NULL)
351352
return (SET_ERROR(ENXIO));
352-
}
353353

354354
mutex_enter(&zv->zv_state_lock);
355355
if (zv->zv_flags & ZVOL_EXCL) {
@@ -376,6 +376,15 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
376376
mutex_exit(&zv->zv_state_lock);
377377
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
378378
mutex_enter(&zv->zv_state_lock);
379+
380+
/*
381+
* Unlike in zvol_geom_open(), we don't check if
382+
* removal started here, because we might be one of the
383+
* openers that needs to be thrown out! If we're the
384+
* last, we need to call zvol_last_close() below to
385+
* finish cleanup. So, no special treatment for us.
386+
*/
387+
379388
/* Check to see if zv_suspend_lock is needed. */
380389
new_open_count = zv->zv_open_count - count;
381390
if (new_open_count != 0) {
@@ -386,7 +395,6 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
386395
} else {
387396
drop_suspend = B_FALSE;
388397
}
389-
rw_exit(&zvol_state_lock);
390398

391399
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
392400

@@ -439,7 +447,7 @@ zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
439447
("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
440448
pp->name, acr, acw, ace));
441449

442-
if (pp->private == NULL) {
450+
if (atomic_load_ptr(&pp->private) == NULL) {
443451
if (acr <= 0 && acw <= 0 && ace <= 0)
444452
return (0);
445453
return (pp->error);
@@ -906,25 +914,14 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
906914
boolean_t drop_suspend = B_FALSE;
907915

908916
retry:
909-
rw_enter(&zvol_state_lock, ZVOL_RW_READER);
910-
/*
911-
* Obtain a copy of si_drv2 under zvol_state_lock to make sure either
912-
* the result of zvol free code setting si_drv2 to NULL is observed,
913-
* or the zv is protected from being freed because of the positive
914-
* zv_open_count.
915-
*/
916-
zv = dev->si_drv2;
917-
if (zv == NULL) {
918-
rw_exit(&zvol_state_lock);
919-
err = SET_ERROR(ENXIO);
920-
goto out_locked;
921-
}
917+
zv = atomic_load_ptr(&dev->si_drv2);
918+
if (zv == NULL)
919+
return (SET_ERROR(ENXIO));
922920

923921
mutex_enter(&zv->zv_state_lock);
924-
if (zv->zv_zso->zso_dying) {
925-
rw_exit(&zvol_state_lock);
922+
if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
926923
err = SET_ERROR(ENXIO);
927-
goto out_zv_locked;
924+
goto out_locked;
928925
}
929926
ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
930927

@@ -939,14 +936,20 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
939936
mutex_exit(&zv->zv_state_lock);
940937
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
941938
mutex_enter(&zv->zv_state_lock);
939+
940+
if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
941+
/* Removal started while locks were down. */
942+
err = SET_ERROR(ENXIO);
943+
goto out_locked;
944+
}
945+
942946
/* Check to see if zv_suspend_lock is needed. */
943947
if (zv->zv_open_count != 0) {
944948
rw_exit(&zv->zv_suspend_lock);
945949
drop_suspend = B_FALSE;
946950
}
947951
}
948952
}
949-
rw_exit(&zvol_state_lock);
950953

951954
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
952955

@@ -974,7 +977,7 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
974977
if (drop_namespace)
975978
mutex_exit(&spa_namespace_lock);
976979
if (err)
977-
goto out_zv_locked;
980+
goto out_locked;
978981
}
979982

980983
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -1001,9 +1004,8 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
10011004
zvol_last_close(zv);
10021005
wakeup(zv);
10031006
}
1004-
out_zv_locked:
1005-
mutex_exit(&zv->zv_state_lock);
10061007
out_locked:
1008+
mutex_exit(&zv->zv_state_lock);
10071009
if (drop_suspend)
10081010
rw_exit(&zv->zv_suspend_lock);
10091011
return (err);
@@ -1015,12 +1017,9 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
10151017
zvol_state_t *zv;
10161018
boolean_t drop_suspend = B_TRUE;
10171019

1018-
rw_enter(&zvol_state_lock, ZVOL_RW_READER);
1019-
zv = dev->si_drv2;
1020-
if (zv == NULL) {
1021-
rw_exit(&zvol_state_lock);
1020+
zv = atomic_load_ptr(&dev->si_drv2);
1021+
if (zv == NULL)
10221022
return (SET_ERROR(ENXIO));
1023-
}
10241023

10251024
mutex_enter(&zv->zv_state_lock);
10261025
if (zv->zv_flags & ZVOL_EXCL) {
@@ -1045,6 +1044,15 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
10451044
mutex_exit(&zv->zv_state_lock);
10461045
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
10471046
mutex_enter(&zv->zv_state_lock);
1047+
1048+
/*
1049+
* Unlike in zvol_cdev_open(), we don't check if
1050+
* removal started here, because we might be one of the
1051+
* openers that needs to be thrown out! If we're the
1052+
* last, we need to call zvol_last_close() below to
1053+
* finish cleanup. So, no special treatment for us.
1054+
*/
1055+
10481056
/* Check to see if zv_suspend_lock is needed. */
10491057
if (zv->zv_open_count != 1) {
10501058
rw_exit(&zv->zv_suspend_lock);
@@ -1054,7 +1062,6 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
10541062
} else {
10551063
drop_suspend = B_FALSE;
10561064
}
1057-
rw_exit(&zvol_state_lock);
10581065

10591066
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
10601067

@@ -1086,7 +1093,8 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
10861093
int error;
10871094
boolean_t sync;
10881095

1089-
zv = dev->si_drv2;
1096+
zv = atomic_load_ptr(&dev->si_drv2);
1097+
ASSERT3P(zv, !=, NULL);
10901098

10911099
error = 0;
10921100
KASSERT(zv->zv_open_count > 0,
@@ -1147,6 +1155,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
11471155
*(off_t *)data = 0;
11481156
break;
11491157
case DIOCGATTR: {
1158+
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
11501159
spa_t *spa = dmu_objset_spa(zv->zv_objset);
11511160
struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
11521161
uint64_t refd, avail, usedobjs, availobjs;
@@ -1171,6 +1180,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
11711180
arg->value.off = refd / DEV_BSIZE;
11721181
} else
11731182
error = SET_ERROR(ENOIOCTL);
1183+
rw_exit(&zv->zv_suspend_lock);
11741184
break;
11751185
}
11761186
case FIOSEEKHOLE:
@@ -1181,10 +1191,12 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
11811191

11821192
hole = (cmd == FIOSEEKHOLE);
11831193
noff = *off;
1194+
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
11841195
lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX,
11851196
RL_READER);
11861197
error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
11871198
zfs_rangelock_exit(lr);
1199+
rw_exit(&zv->zv_suspend_lock);
11881200
*off = noff;
11891201
break;
11901202
}
@@ -1398,7 +1410,7 @@ zvol_os_remove_minor(zvol_state_t *zv)
13981410
if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
13991411
struct zvol_state_geom *zsg = &zso->zso_geom;
14001412
struct g_provider *pp = zsg->zsg_provider;
1401-
pp->private = NULL;
1413+
atomic_store_ptr(&pp->private, NULL);
14021414
mutex_exit(&zv->zv_state_lock);
14031415

14041416
g_topology_lock();
@@ -1409,7 +1421,7 @@ zvol_os_remove_minor(zvol_state_t *zv)
14091421
struct cdev *dev = zsd->zsd_cdev;
14101422

14111423
if (dev != NULL)
1412-
dev->si_drv2 = NULL;
1424+
atomic_store_ptr(&dev->si_drv2, NULL);
14131425
mutex_exit(&zv->zv_state_lock);
14141426

14151427
if (dev != NULL) {

0 commit comments

Comments
 (0)