Skip to content

Commit a532815

Browse files
committed
Fix snapshot automount race causing duplicate mounts and AVL tree panic
Multiple threads racing to automount the same snapshot can both spawn mount helper processes that successfully complete, causing both parent threads to attempt AVL tree registration and triggering a VERIFY() panic in avl_add(). This occurs because the fsconfig/fsmount API lacks the serialization provided by traditional mount() via lock_mount(). The fix adds a per-entry mutex (se_mtx) to zfs_snapentry_t that serializes mount and unmount operations on the same snapshot. The first mount thread creates a pending entry with se_spa=NULL and holds se_mtx during the helper execution. Concurrent mounts find the pending entry and return success without spawning duplicate helpers. Unmount waits on se_mtx if a mount is pending, ensuring proper serialization. This allows different snapshots to mount in parallel while preventing the AVL panic. Signed-off-by: Ameer Hamza <[email protected]>
1 parent 7f7d493 commit a532815

File tree

1 file changed

+106
-37
lines changed

1 file changed

+106
-37
lines changed

module/os/linux/zfs/zfs_ctldir.c

Lines changed: 106 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -117,14 +117,17 @@ static int zfs_snapshot_no_setuid = 0;
117117
typedef struct {
118118
char *se_name; /* full snapshot name */
119119
char *se_path; /* full mount path */
120-
spa_t *se_spa; /* pool spa */
120+
spa_t *se_spa; /* pool spa (NULL if pending) */
121121
uint64_t se_objsetid; /* snapshot objset id */
122122
struct dentry *se_root_dentry; /* snapshot root dentry */
123123
krwlock_t se_taskqid_lock; /* scheduled unmount taskqid lock */
124124
taskqid_t se_taskqid; /* scheduled unmount taskqid */
125125
avl_node_t se_node_name; /* zfs_snapshots_by_name link */
126126
avl_node_t se_node_objsetid; /* zfs_snapshots_by_objsetid link */
127127
zfs_refcount_t se_refcount; /* reference count */
128+
kmutex_t se_mtx; /* serialize auto-mount/unmount */
129+
kcondvar_t se_cv; /* signal mount completion */
130+
boolean_t se_mounting; /* mount operation in progress */
128131
} zfs_snapentry_t;
129132

130133
static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay);
@@ -148,6 +151,9 @@ zfsctl_snapshot_alloc(const char *full_name, const char *full_path, spa_t *spa,
148151
se->se_root_dentry = root_dentry;
149152
se->se_taskqid = TASKQID_INVALID;
150153
rw_init(&se->se_taskqid_lock, NULL, RW_DEFAULT, NULL);
154+
mutex_init(&se->se_mtx, NULL, MUTEX_DEFAULT, NULL);
155+
cv_init(&se->se_cv, NULL, CV_DEFAULT, NULL);
156+
se->se_mounting = B_FALSE;
151157

152158
zfs_refcount_create(&se->se_refcount);
153159

@@ -165,6 +171,8 @@ zfsctl_snapshot_free(zfs_snapentry_t *se)
165171
kmem_strfree(se->se_name);
166172
kmem_strfree(se->se_path);
167173
rw_destroy(&se->se_taskqid_lock);
174+
mutex_destroy(&se->se_mtx);
175+
cv_destroy(&se->se_cv);
168176

169177
kmem_free(se, sizeof (zfs_snapentry_t));
170178
}
@@ -190,34 +198,52 @@ zfsctl_snapshot_rele(zfs_snapentry_t *se)
190198
}
191199

192200
/*
193-
* Add a zfs_snapentry_t to both the zfs_snapshots_by_name and
194-
* zfs_snapshots_by_objsetid trees. While the zfs_snapentry_t is part
195-
* of the trees a reference is held.
201+
* Add a zfs_snapentry_t to the zfs_snapshots_by_name tree. If the entry
202+
* is not pending (se_spa != NULL), also add to zfs_snapshots_by_objsetid.
203+
* While the zfs_snapentry_t is part of the trees a reference is held.
196204
*/
197205
static void
198206
zfsctl_snapshot_add(zfs_snapentry_t *se)
199207
{
200208
ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
201209
zfsctl_snapshot_hold(se);
202210
avl_add(&zfs_snapshots_by_name, se);
203-
avl_add(&zfs_snapshots_by_objsetid, se);
211+
if (se->se_spa != NULL)
212+
avl_add(&zfs_snapshots_by_objsetid, se);
204213
}
205214

206215
/*
207-
* Remove a zfs_snapentry_t from both the zfs_snapshots_by_name and
208-
* zfs_snapshots_by_objsetid trees. Upon removal a reference is dropped,
209-
* this can result in the structure being freed if that was the last
210-
* remaining reference.
216+
* Remove a zfs_snapentry_t from the zfs_snapshots_by_name tree and
217+
* zfs_snapshots_by_objsetid tree (if not pending). Upon removal a
218+
* reference is dropped, this can result in the structure being freed
219+
* if that was the last remaining reference.
211220
*/
212221
static void
213222
zfsctl_snapshot_remove(zfs_snapentry_t *se)
214223
{
215224
ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
216225
avl_remove(&zfs_snapshots_by_name, se);
217-
avl_remove(&zfs_snapshots_by_objsetid, se);
226+
if (se->se_spa != NULL)
227+
avl_remove(&zfs_snapshots_by_objsetid, se);
218228
zfsctl_snapshot_rele(se);
219229
}
220230

231+
/*
232+
* Fill a pending zfs_snapentry_t after mount succeeds. Fills in the
233+
* remaining fields and adds the entry to the zfs_snapshots_by_objsetid tree.
234+
*/
235+
static void
236+
zfsctl_snapshot_fill(zfs_snapentry_t *se, spa_t *spa, uint64_t objsetid,
237+
struct dentry *root_dentry)
238+
{
239+
ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
240+
ASSERT3P(se->se_spa, ==, NULL);
241+
se->se_spa = spa;
242+
se->se_objsetid = objsetid;
243+
se->se_root_dentry = root_dentry;
244+
avl_add(&zfs_snapshots_by_objsetid, se);
245+
}
246+
221247
/*
222248
* Snapshot name comparison function for the zfs_snapshots_by_name.
223249
*/
@@ -315,6 +341,11 @@ zfsctl_snapshot_rename(const char *old_snapname, const char *new_snapname)
315341
se = zfsctl_snapshot_find_by_name(old_snapname);
316342
if (se == NULL)
317343
return (SET_ERROR(ENOENT));
344+
if (se->se_spa == NULL) {
345+
/* Snapshot mount is in progress */
346+
zfsctl_snapshot_rele(se);
347+
return (SET_ERROR(EBUSY));
348+
}
318349

319350
zfsctl_snapshot_remove(se);
320351
kmem_strfree(se->se_name);
@@ -437,26 +468,6 @@ zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, int delay)
437468
return (error);
438469
}
439470

440-
/*
441-
* Check if snapname is currently mounted. Returned non-zero when mounted
442-
* and zero when unmounted.
443-
*/
444-
static boolean_t
445-
zfsctl_snapshot_ismounted(const char *snapname)
446-
{
447-
zfs_snapentry_t *se;
448-
boolean_t ismounted = B_FALSE;
449-
450-
rw_enter(&zfs_snapshot_lock, RW_READER);
451-
if ((se = zfsctl_snapshot_find_by_name(snapname)) != NULL) {
452-
zfsctl_snapshot_rele(se);
453-
ismounted = B_TRUE;
454-
}
455-
rw_exit(&zfs_snapshot_lock);
456-
457-
return (ismounted);
458-
}
459-
460471
/*
461472
* Check if the given inode is a part of the virtual .zfs directory.
462473
*/
@@ -1133,13 +1144,22 @@ zfsctl_snapshot_unmount(const char *snapname, int flags)
11331144
}
11341145
rw_exit(&zfs_snapshot_lock);
11351146

1147+
/*
1148+
* Wait for any pending auto-mount to complete before unmounting.
1149+
*/
1150+
mutex_enter(&se->se_mtx);
1151+
while (se->se_mounting)
1152+
cv_wait(&se->se_cv, &se->se_mtx);
1153+
11361154
exportfs_flush();
11371155

11381156
if (flags & MNT_FORCE)
11391157
argv[4] = "-fn";
11401158
argv[5] = se->se_path;
11411159
dprintf("unmount; path=%s\n", se->se_path);
11421160
error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
1161+
1162+
mutex_exit(&se->se_mtx);
11431163
zfsctl_snapshot_rele(se);
11441164

11451165

@@ -1234,14 +1254,40 @@ zfsctl_snapshot_mount(struct path *path, int flags)
12341254
zfs_snapshot_no_setuid ? "nosuid" : "suid");
12351255

12361256
/*
1237-
* Multiple concurrent automounts of a snapshot are never allowed.
1238-
* The snapshot may be manually mounted as many times as desired.
1257+
* Create a pending AVL entry for the snapshot to prevent concurrent
1258+
* automount attempts. If an entry already exists, the snapshot is
1259+
* either mounted or being mounted by another thread, so return
1260+
* success without starting a duplicate mount operation. returning
1261+
* EBUSY would mean that VFS would retry, which we do not want.
12391262
*/
1240-
if (zfsctl_snapshot_ismounted(full_name)) {
1263+
rw_enter(&zfs_snapshot_lock, RW_WRITER);
1264+
if ((se = zfsctl_snapshot_find_by_name(full_name)) != NULL) {
1265+
rw_exit(&zfs_snapshot_lock);
1266+
1267+
/*
1268+
* Entry exists - wait for pending auto-mount to complete.
1269+
*/
1270+
mutex_enter(&se->se_mtx);
1271+
while (se->se_mounting)
1272+
cv_wait(&se->se_cv, &se->se_mtx);
1273+
mutex_exit(&se->se_mtx);
1274+
zfsctl_snapshot_rele(se);
12411275
error = 0;
12421276
goto error;
12431277
}
12441278

1279+
/*
1280+
* Create pending entry (se_spa=NULL), add to AVL tree, and acquire
1281+
* se_mtx to serialize with concurrent unmount operations. Release
1282+
* global lock before long-running mount helper.
1283+
*/
1284+
se = zfsctl_snapshot_alloc(full_name, full_path, NULL, 0, NULL);
1285+
se->se_mounting = B_TRUE;
1286+
zfsctl_snapshot_add(se);
1287+
zfsctl_snapshot_hold(se);
1288+
mutex_enter(&se->se_mtx);
1289+
rw_exit(&zfs_snapshot_lock);
1290+
12451291
/*
12461292
* Attempt to mount the snapshot from user space. Normally this
12471293
* would be done using the vfs_kern_mount() function, however that
@@ -1260,6 +1306,18 @@ zfsctl_snapshot_mount(struct path *path, int flags)
12601306
argv[9] = full_path;
12611307
error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
12621308
if (error) {
1309+
/*
1310+
* Mount failed - remove pending entry from AVL, signal waiting
1311+
* threads, release mutex, and drop mount operation reference.
1312+
*/
1313+
rw_enter(&zfs_snapshot_lock, RW_WRITER);
1314+
zfsctl_snapshot_remove(se);
1315+
rw_exit(&zfs_snapshot_lock);
1316+
se->se_mounting = B_FALSE;
1317+
cv_broadcast(&se->se_cv);
1318+
mutex_exit(&se->se_mtx);
1319+
zfsctl_snapshot_rele(se);
1320+
12631321
if (!(error & MOUNT_BUSY << 8)) {
12641322
zfs_dbgmsg("Unable to automount %s error=%d",
12651323
full_path, error);
@@ -1291,14 +1349,25 @@ zfsctl_snapshot_mount(struct path *path, int flags)
12911349
spath.mnt->mnt_flags |= MNT_SHRINKABLE;
12921350

12931351
rw_enter(&zfs_snapshot_lock, RW_WRITER);
1294-
se = zfsctl_snapshot_alloc(full_name, full_path,
1295-
snap_zfsvfs->z_os->os_spa, dmu_objset_id(snap_zfsvfs->z_os),
1296-
dentry);
1297-
zfsctl_snapshot_add(se);
1352+
zfsctl_snapshot_fill(se, snap_zfsvfs->z_os->os_spa,
1353+
dmu_objset_id(snap_zfsvfs->z_os), dentry);
12981354
zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
12991355
rw_exit(&zfs_snapshot_lock);
1356+
} else {
1357+
rw_enter(&zfs_snapshot_lock, RW_WRITER);
1358+
zfsctl_snapshot_remove(se);
1359+
rw_exit(&zfs_snapshot_lock);
13001360
}
13011361
path_put(&spath);
1362+
1363+
/*
1364+
* Signal waiting threads that mount operation has completed.
1365+
* Release mutex and drop mount operation reference.
1366+
*/
1367+
se->se_mounting = B_FALSE;
1368+
cv_broadcast(&se->se_cv);
1369+
mutex_exit(&se->se_mtx);
1370+
zfsctl_snapshot_rele(se);
13021371
error:
13031372
kmem_free(full_name, ZFS_MAX_DATASET_NAME_LEN);
13041373
kmem_free(full_path, MAXPATHLEN);

0 commit comments

Comments
 (0)