@@ -833,6 +833,14 @@ vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
833833 return (kid -> v_read (kid , bp , buf , offset , bytes ));
834834}
835835
836+ /*
837+ * List of vdevs that were fully initialized from their own label, but later a
838+ * newer label was found that obsoleted the stale label, freeing its
839+ * configuration tree. We keep those vdevs around, since a new configuration
840+ * may include them.
841+ */
842+ static vdev_list_t orphans = STAILQ_HEAD_INITIALIZER (orphans );
843+
836844static vdev_t *
837845vdev_find (vdev_list_t * list , uint64_t guid )
838846{
@@ -854,6 +862,11 @@ vdev_create(uint64_t guid, vdev_read_t *_read)
854862 vdev_t * vdev ;
855863 vdev_indirect_config_t * vic ;
856864
865+ if ((vdev = vdev_find (& orphans , guid ))) {
866+ STAILQ_REMOVE (& orphans , vdev , vdev , v_childlink );
867+ return (vdev );
868+ }
869+
857870 vdev = calloc (1 , sizeof (vdev_t ));
858871 if (vdev != NULL ) {
859872 STAILQ_INIT (& vdev -> v_children );
@@ -1101,8 +1114,8 @@ vdev_insert(vdev_t *top_vdev, vdev_t *vdev)
11011114}
11021115
11031116static int
1104- vdev_from_nvlist (spa_t * spa , uint64_t top_guid , uint64_t txg ,
1105- const nvlist_t * nvlist )
1117+ vdev_from_nvlist (spa_t * spa , uint64_t top_guid , uint64_t label_guid ,
1118+ uint64_t txg , const nvlist_t * nvlist )
11061119{
11071120 vdev_t * top_vdev , * vdev ;
11081121 nvlist_t * * kids = NULL ;
@@ -1116,6 +1129,7 @@ vdev_from_nvlist(spa_t *spa, uint64_t top_guid, uint64_t txg,
11161129 return (rc );
11171130 top_vdev -> v_spa = spa ;
11181131 top_vdev -> v_top = top_vdev ;
1132+ top_vdev -> v_label = label_guid ;
11191133 top_vdev -> v_txg = txg ;
11201134 (void )vdev_insert (spa -> spa_root_vdev , top_vdev );
11211135 }
@@ -1160,12 +1174,14 @@ vdev_from_nvlist(spa_t *spa, uint64_t top_guid, uint64_t txg,
11601174static int
11611175vdev_init_from_label (spa_t * spa , const nvlist_t * nvlist )
11621176{
1163- uint64_t top_guid , txg ;
1177+ uint64_t top_guid , label_guid , txg ;
11641178 nvlist_t * vdevs ;
11651179 int rc ;
11661180
11671181 if (nvlist_find (nvlist , ZPOOL_CONFIG_TOP_GUID , DATA_TYPE_UINT64 ,
11681182 NULL , & top_guid , NULL ) ||
1183+ nvlist_find (nvlist , ZPOOL_CONFIG_GUID , DATA_TYPE_UINT64 ,
1184+ NULL , & label_guid , NULL ) != 0 ||
11691185 nvlist_find (nvlist , ZPOOL_CONFIG_POOL_TXG , DATA_TYPE_UINT64 ,
11701186 NULL , & txg , NULL ) != 0 ||
11711187 nvlist_find (nvlist , ZPOOL_CONFIG_VDEV_TREE , DATA_TYPE_NVLIST ,
@@ -1174,7 +1190,7 @@ vdev_init_from_label(spa_t *spa, const nvlist_t *nvlist)
11741190 return (ENOENT );
11751191 }
11761192
1177- rc = vdev_from_nvlist (spa , top_guid , txg , vdevs );
1193+ rc = vdev_from_nvlist (spa , top_guid , label_guid , txg , vdevs );
11781194 nvlist_destroy (vdevs );
11791195 return (rc );
11801196}
@@ -1271,7 +1287,10 @@ vdev_free(struct vdev *vdev)
12711287
12721288 STAILQ_FOREACH_SAFE (kid , & vdev -> v_children , v_childlink , safe )
12731289 vdev_free (kid );
1274- free (vdev );
1290+ if (vdev -> v_phys_read != NULL )
1291+ STAILQ_INSERT_HEAD (& orphans , vdev , v_childlink );
1292+ else
1293+ free (vdev );
12751294}
12761295
12771296static int
@@ -1323,7 +1342,7 @@ vdev_init_from_nvlist(spa_t *spa, const nvlist_t *nvlist)
13231342 * XXXGL: how can this happen?
13241343 */
13251344 if (vdev == NULL )
1326- rc = vdev_from_nvlist (spa , guid , 0 , kids [i ]);
1345+ rc = vdev_from_nvlist (spa , guid , 0 , 0 , kids [i ]);
13271346 else
13281347 rc = vdev_update_from_nvlist (spa -> spa_root_vdev , guid ,
13291348 kids [i ]);
@@ -1344,6 +1363,53 @@ vdev_init_from_nvlist(spa_t *spa, const nvlist_t *nvlist)
13441363 return (rc );
13451364}
13461365
1366+ static bool
1367+ nvlist_find_child_guid (const nvlist_t * nvlist , uint64_t guid )
1368+ {
1369+ nvlist_t * * kids = NULL ;
1370+ int nkids , i ;
1371+ bool rv = false;
1372+
1373+ if (nvlist_find (nvlist , ZPOOL_CONFIG_CHILDREN , DATA_TYPE_NVLIST_ARRAY ,
1374+ & nkids , & kids , NULL ) != 0 )
1375+ nkids = 0 ;
1376+
1377+ for (i = 0 ; i < nkids ; i ++ ) {
1378+ uint64_t kid_guid ;
1379+
1380+ if (nvlist_find (kids [i ], ZPOOL_CONFIG_GUID , DATA_TYPE_UINT64 ,
1381+ NULL , & kid_guid , NULL ) != 0 )
1382+ break ;
1383+ if (kid_guid == guid )
1384+ rv = true;
1385+ else
1386+ rv = nvlist_find_child_guid (kids [i ], guid );
1387+ if (rv )
1388+ break ;
1389+ }
1390+
1391+ for (i = 0 ; i < nkids ; i ++ )
1392+ nvlist_destroy (kids [i ]);
1393+ free (kids );
1394+
1395+ return (rv );
1396+ }
1397+
1398+ static bool
1399+ nvlist_find_vdev_guid (const nvlist_t * nvlist , uint64_t guid )
1400+ {
1401+ nvlist_t * vdevs ;
1402+ bool rv ;
1403+
1404+ if (nvlist_find (nvlist , ZPOOL_CONFIG_VDEV_TREE , DATA_TYPE_NVLIST , NULL ,
1405+ & vdevs , NULL ) != 0 )
1406+ return (false);
1407+ rv = nvlist_find_child_guid (vdevs , guid );
1408+ nvlist_destroy (vdevs );
1409+
1410+ return (rv );
1411+ }
1412+
13471413static spa_t *
13481414spa_find_by_guid (uint64_t guid )
13491415{
@@ -2012,7 +2078,7 @@ vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t *_write, void *priv,
20122078{
20132079 vdev_t vtmp ;
20142080 spa_t * spa ;
2015- vdev_t * vdev ;
2081+ vdev_t * vdev , * top ;
20162082 nvlist_t * nvl ;
20172083 uint64_t val ;
20182084 uint64_t guid , pool_guid , top_guid , txg ;
@@ -2109,22 +2175,47 @@ vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t *_write, void *priv,
21092175 nvlist_destroy (nvl );
21102176 return (ENOMEM );
21112177 }
2112- } else {
2113- struct vdev * kid ;
2114-
2115- STAILQ_FOREACH (kid , & spa -> spa_root_vdev -> v_children ,
2116- v_childlink )
2117- if (kid -> v_guid == top_guid && kid -> v_txg < txg ) {
2118- printf ("ZFS: pool %s vdev %s ignoring stale "
2119- "label from txg 0x%jx, using 0x%jx@0x%jx\n" ,
2120- spa -> spa_name , kid -> v_name ,
2121- kid -> v_txg , guid , txg );
2178+ }
2179+
2180+ /*
2181+ * Check if configuration is already known. If configuration is known
2182+ * and txg numbers don't match, we got 2x2 scenarios here. First, is
2183+ * the label being read right now _newer_ than the one read before.
2184+ * Second, is the vdev that provided the stale label _present_ in the
2185+ * newer configuration. If neither is true, we completely ignore the
2186+ * label.
2187+ */
2188+ STAILQ_FOREACH (top , & spa -> spa_root_vdev -> v_children , v_childlink )
2189+ if (top -> v_guid == top_guid ) {
2190+ bool newer , present ;
2191+
2192+ if (top -> v_txg == txg )
2193+ break ;
2194+ newer = (top -> v_txg < txg );
2195+ present = newer ?
2196+ nvlist_find_vdev_guid (nvl , top -> v_label ) :
2197+ (vdev_find (& top -> v_children , guid ) != NULL );
2198+ printf ("ZFS: pool %s vdev %s %s stale label from "
2199+ "0x%jx@0x%jx, %s 0x%jx@0x%jx\n" ,
2200+ spa -> spa_name , top -> v_name ,
2201+ present ? "using" : "ignoring" ,
2202+ newer ? top -> v_label : guid ,
2203+ newer ? top -> v_txg : txg ,
2204+ present ? "referred by" : "using" ,
2205+ newer ? guid : top -> v_label ,
2206+ newer ? txg : top -> v_txg );
2207+ if (newer ) {
21222208 STAILQ_REMOVE (& spa -> spa_root_vdev -> v_children ,
2123- kid , vdev , v_childlink );
2124- vdev_free (kid );
2209+ top , vdev , v_childlink );
2210+ vdev_free (top );
21252211 break ;
2212+ } else if (present ) {
2213+ break ;
2214+ } else {
2215+ nvlist_destroy (nvl );
2216+ return (EIO );
21262217 }
2127- }
2218+ }
21282219
21292220 /*
21302221 * Get the vdev tree and create our in-core copy of it.
0 commit comments