@@ -348,15 +348,16 @@ void OSDService::identify_splits_and_merges(
348348 return ;
349349 }
350350 int old_pgnum = old_map->get_pg_num (pgid.pool ());
351- auto p = osd->pg_num_history .pg_nums .find (pgid.pool ());
352- if (p == osd->pg_num_history .pg_nums .end ()) {
351+ if (!osd->pg_num_history .pg_nums .contains (pgid.pool ())) {
353352 dout (20 ) << __func__ << " " << pgid << " pool " << pgid.pool ()
354353 << " has no history" << dendl;
355354 return ;
356355 }
356+ // The pgid's pool [epoch -> pg_num] map
357+ const auto & pool_pg_num_history_map = osd->pg_num_history .pg_nums [pgid.pool ()];
357358 dout (20 ) << __func__ << " " << pgid << " e" << old_map->get_epoch ()
358359 << " to e" << new_map->get_epoch ()
359- << " pg_nums " << p-> second << dendl;
360+ << " pg_nums " << pool_pg_num_history_map << dendl;
360361 deque<spg_t > queue;
361362 queue.push_back (pgid);
362363 set<spg_t > did;
@@ -365,83 +366,87 @@ void OSDService::identify_splits_and_merges(
365366 queue.pop_front ();
366367 did.insert (cur);
367368 unsigned pgnum = old_pgnum;
368- for (auto q = p->second .lower_bound (old_map->get_epoch ());
369- q != p->second .end () &&
370- q->first <= new_map->get_epoch ();
371- ++q) {
372- if (pgnum < q->second ) {
369+ for (auto map_iter = pool_pg_num_history_map.lower_bound (old_map->get_epoch ());
370+ map_iter != pool_pg_num_history_map.end ();
371+ ++map_iter) {
372+ const auto & [new_epoch, new_pgnum] = *map_iter;
373+ if (new_epoch > new_map->get_epoch ()) {
374+ // don't handle any changes recorded later than new_map's epoch
375+ break ;
376+ }
377+ if (pgnum < new_pgnum) {
373378 // split?
374379 if (cur.ps () < pgnum) {
375380 set<spg_t > children;
376- if (cur.is_split (pgnum, q-> second , &children)) {
377- dout (20 ) << __func__ << " " << cur << " e" << q-> first
378- << " pg_num " << pgnum << " -> " << q-> second
381+ if (cur.is_split (pgnum, new_pgnum , &children)) {
382+ dout (20 ) << __func__ << " " << cur << " e" << new_epoch
383+ << " pg_num " << pgnum << " -> " << new_pgnum
379384 << " children " << children << dendl;
380385 for (auto i : children) {
381- split_children->insert (make_pair (i, q-> first ));
386+ split_children->insert (make_pair (i, new_epoch ));
382387 if (!did.count (i))
383388 queue.push_back (i);
384389 }
385390 }
386- } else if (cur.ps () < q-> second ) {
387- dout (20 ) << __func__ << " " << cur << " e" << q-> first
388- << " pg_num " << pgnum << " -> " << q-> second
391+ } else if (cur.ps () < new_pgnum ) {
392+ dout (20 ) << __func__ << " " << cur << " e" << new_epoch
393+ << " pg_num " << pgnum << " -> " << new_pgnum
389394 << " is a child" << dendl;
390395 // normally we'd capture this from the parent, but it's
391396 // possible the parent doesn't exist yet (it will be
392397 // fabricated to allow an intervening merge). note this PG
393398 // as a split child here to be sure we catch it.
394- split_children->insert (make_pair (cur, q-> first ));
399+ split_children->insert (make_pair (cur, new_epoch ));
395400 } else {
396- dout (20 ) << __func__ << " " << cur << " e" << q-> first
397- << " pg_num " << pgnum << " -> " << q-> second
401+ dout (20 ) << __func__ << " " << cur << " e" << new_epoch
402+ << " pg_num " << pgnum << " -> " << new_pgnum
398403 << " is post-split, skipping" << dendl;
399404 }
400405 } else if (merge_pgs) {
401406 // merge?
402- if (cur.ps () >= q-> second ) {
407+ if (cur.ps () >= new_pgnum ) {
403408 if (cur.ps () < pgnum) {
404409 spg_t parent;
405- if (cur.is_merge_source (pgnum, q-> second , &parent)) {
410+ if (cur.is_merge_source (pgnum, new_pgnum , &parent)) {
406411 set<spg_t > children;
407- parent.is_split (q-> second , pgnum, &children);
408- dout (20 ) << __func__ << " " << cur << " e" << q-> first
409- << " pg_num " << pgnum << " -> " << q-> second
412+ parent.is_split (new_pgnum , pgnum, &children);
413+ dout (20 ) << __func__ << " " << cur << " e" << new_epoch
414+ << " pg_num " << pgnum << " -> " << new_pgnum
410415 << " is merge source, target " << parent
411416 << " , source(s) " << children << dendl;
412- merge_pgs->insert (make_pair (parent, q-> first ));
417+ merge_pgs->insert (make_pair (parent, new_epoch ));
413418 if (!did.count (parent)) {
414419 // queue (and re-scan) parent in case it might not exist yet
415420 // and there are some future splits pending on it
416421 queue.push_back (parent);
417422 }
418423 for (auto c : children) {
419- merge_pgs->insert (make_pair (c, q-> first ));
424+ merge_pgs->insert (make_pair (c, new_epoch ));
420425 if (!did.count (c))
421426 queue.push_back (c);
422427 }
423428 }
424429 } else {
425- dout (20 ) << __func__ << " " << cur << " e" << q-> first
426- << " pg_num " << pgnum << " -> " << q-> second
430+ dout (20 ) << __func__ << " " << cur << " e" << new_epoch
431+ << " pg_num " << pgnum << " -> " << new_pgnum
427432 << " is beyond old pgnum, skipping" << dendl;
428433 }
429434 } else {
430435 set<spg_t > children;
431- if (cur.is_split (q-> second , pgnum, &children)) {
432- dout (20 ) << __func__ << " " << cur << " e" << q-> first
433- << " pg_num " << pgnum << " -> " << q-> second
436+ if (cur.is_split (new_pgnum , pgnum, &children)) {
437+ dout (20 ) << __func__ << " " << cur << " e" << new_epoch
438+ << " pg_num " << pgnum << " -> " << new_pgnum
434439 << " is merge target, source " << children << dendl;
435440 for (auto c : children) {
436- merge_pgs->insert (make_pair (c, q-> first ));
441+ merge_pgs->insert (make_pair (c, new_epoch ));
437442 if (!did.count (c))
438443 queue.push_back (c);
439444 }
440- merge_pgs->insert (make_pair (cur, q-> first ));
445+ merge_pgs->insert (make_pair (cur, new_epoch ));
441446 }
442447 }
443448 }
444- pgnum = q-> second ;
449+ pgnum = new_pgnum ;
445450 }
446451 }
447452}
@@ -8277,13 +8282,18 @@ void OSD::handle_osd_map(MOSDMap *m)
82778282 rerequest_full_maps ();
82788283 }
82798284
8285+ track_pools_and_pg_num_changes (added_maps, t);
8286+
82808287 if (!superblock.maps .empty ()) {
82818288 trim_maps (m->cluster_osdmap_trim_lower_bound );
82828289 pg_num_history.prune (superblock.get_oldest_map ());
82838290 }
82848291 superblock.insert_osdmap_epochs (first, last);
82858292 if (superblock.maps .num_intervals () > 1 ) {
8286- dout (10 ) << __func__ << " osd map gap " << superblock.maps << dendl;
8293+ // we had a map gap and not yet trimmed all the way up to
8294+ // cluster_osdmap_trim_lower_bound
8295+ dout (10 ) << __func__ << " osd maps are not contiguous"
8296+ << superblock.maps << dendl;
82878297 }
82888298 superblock.current_epoch = last;
82898299
@@ -8294,54 +8304,6 @@ void OSD::handle_osd_map(MOSDMap *m)
82948304 superblock.clean_thru = last;
82958305 }
82968306
8297- // check for pg_num changes and deleted pools
8298- OSDMapRef lastmap;
8299- for (auto & i : added_maps) {
8300- if (!lastmap) {
8301- if (!(lastmap = service.try_get_map (i.first - 1 ))) {
8302- dout (10 ) << __func__ << " can't get previous map " << i.first - 1
8303- << " probably first start of this osd" << dendl;
8304- continue ;
8305- }
8306- }
8307- ceph_assert (lastmap->get_epoch () + 1 == i.second ->get_epoch ());
8308- for (auto & j : lastmap->get_pools ()) {
8309- if (!i.second ->have_pg_pool (j.first )) {
8310- pg_num_history.log_pool_delete (i.first , j.first );
8311- dout (10 ) << __func__ << " recording final pg_pool_t for pool "
8312- << j.first << dendl;
8313- // this information is needed by _make_pg() if have to restart before
8314- // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8315- ghobject_t obj = make_final_pool_info_oid (j.first );
8316- bufferlist bl;
8317- encode (j.second , bl, CEPH_FEATURES_ALL);
8318- string name = lastmap->get_pool_name (j.first );
8319- encode (name, bl);
8320- map<string,string> profile;
8321- if (lastmap->get_pg_pool (j.first )->is_erasure ()) {
8322- profile = lastmap->get_erasure_code_profile (
8323- lastmap->get_pg_pool (j.first )->erasure_code_profile );
8324- }
8325- encode (profile, bl);
8326- t.write (coll_t::meta (), obj, 0 , bl.length (), bl);
8327- } else if (unsigned new_pg_num = i.second ->get_pg_num (j.first );
8328- new_pg_num != j.second .get_pg_num ()) {
8329- dout (10 ) << __func__ << " recording pool " << j.first << " pg_num "
8330- << j.second .get_pg_num () << " -> " << new_pg_num << dendl;
8331- pg_num_history.log_pg_num_change (i.first , j.first , new_pg_num);
8332- }
8333- }
8334- for (auto & j : i.second ->get_pools ()) {
8335- if (!lastmap->have_pg_pool (j.first )) {
8336- dout (10 ) << __func__ << " recording new pool " << j.first << " pg_num "
8337- << j.second .get_pg_num () << dendl;
8338- pg_num_history.log_pg_num_change (i.first , j.first ,
8339- j.second .get_pg_num ());
8340- }
8341- }
8342- lastmap = i.second ;
8343- }
8344- pg_num_history.epoch = last;
83458307 {
83468308 bufferlist bl;
83478309 ::encode (pg_num_history, bl);
@@ -8385,6 +8347,101 @@ void OSD::handle_osd_map(MOSDMap *m)
83858347 service.publish_superblock (superblock);
83868348}
83878349
8350+ /*
8351+ * Compare between the previous last_map we had to
8352+ * each one of the added_maps.
8353+ * Track all of the changes relevant in pg_num_history.
8354+ */
8355+ void OSD::track_pools_and_pg_num_changes (
8356+ const map<epoch_t ,OSDMapRef>& added_maps,
8357+ ObjectStore::Transaction& t)
8358+ {
8359+ epoch_t first = added_maps.begin ()->first ;
8360+ epoch_t last = added_maps.rbegin ()->first ;
8361+
8362+ // Unless this is the first start of this OSD,
8363+ // lastmap should be the newest_map we have.
8364+ OSDMapRef lastmap;
8365+
8366+ if (superblock.maps .empty ()) {
8367+ dout (10 ) << __func__ << " no maps stored, this is probably "
8368+ << " the first start of this osd" << dendl;
8369+ lastmap = added_maps.at (first);
8370+ } else {
8371+ if (first > superblock.get_newest_map () + 1 ) {
8372+ ceph_assert (first == superblock.cluster_osdmap_trim_lower_bound );
8373+ dout (20 ) << __func__ << " can't get previous map "
8374+ << superblock.get_newest_map ()
8375+ << " first start of this osd after a map gap" << dendl;
8376+ }
8377+ if (!(lastmap =
8378+ service.try_get_map (superblock.get_newest_map ()))) {
8379+ // This is unexpected
8380+ ceph_abort ();
8381+ }
8382+ }
8383+
8384+ // For each added map, record any changes into pg_num_history
8385+ // and update lastmap afterwards.
8386+ for (auto & [current_added_map_epoch, current_added_map] : added_maps) {
8387+ _track_pools_and_pg_num_changes (t, lastmap,
8388+ current_added_map,
8389+ current_added_map_epoch);
8390+ lastmap = current_added_map;
8391+ }
8392+ pg_num_history.epoch = last;
8393+ }
8394+
8395+ void OSD::_track_pools_and_pg_num_changes (
8396+ ObjectStore::Transaction& t,
8397+ const OSDMapRef& lastmap,
8398+ const OSDMapRef& current_added_map,
8399+ epoch_t current_added_map_epoch)
8400+ {
8401+ // 1) Check if a pool was deleted
8402+ for (auto & [pool_id, pg_pool] : lastmap->get_pools ()) {
8403+ if (!current_added_map->have_pg_pool (pool_id)) {
8404+ pg_num_history.log_pool_delete (current_added_map_epoch, pool_id);
8405+ dout (10 ) << __func__ << " recording final pg_pool_t for pool "
8406+ << pool_id << dendl;
8407+ // this information is needed by _make_pg() if have to restart before
8408+ // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8409+ ghobject_t obj = make_final_pool_info_oid (pool_id);
8410+ bufferlist bl;
8411+ encode (pg_pool, bl, CEPH_FEATURES_ALL);
8412+ string name = lastmap->get_pool_name (pool_id);
8413+ encode (name, bl);
8414+ map<string,string> profile;
8415+ if (lastmap->get_pg_pool (pool_id)->is_erasure ()) {
8416+ profile = lastmap->get_erasure_code_profile (
8417+ lastmap->get_pg_pool (pool_id)->erasure_code_profile );
8418+ }
8419+ encode (profile, bl);
8420+ t.write (coll_t::meta (), obj, 0 , bl.length (), bl);
8421+
8422+ // 2) For existing pools, check if pg_num was changed
8423+ } else if (unsigned new_pg_num = current_added_map->get_pg_num (pool_id);
8424+ new_pg_num != pg_pool.get_pg_num ()) {
8425+ dout (10 ) << __func__ << " recording pool " << pool_id << " pg_num "
8426+ << pg_pool.get_pg_num () << " -> " << new_pg_num << dendl;
8427+ pg_num_history.log_pg_num_change (current_added_map_epoch,
8428+ pool_id,
8429+ new_pg_num);
8430+ }
8431+ }
8432+
8433+ // 3) Check if a pool was created
8434+ for (auto & [pool_id, pg_pool] : current_added_map->get_pools ()) {
8435+ if (!lastmap->have_pg_pool (pool_id)) {
8436+ dout (10 ) << __func__ << " recording new pool " <<pool_id << " pg_num "
8437+ << pg_pool.get_pg_num () << dendl;
8438+ pg_num_history.log_pg_num_change (current_added_map_epoch,
8439+ pool_id,
8440+ pg_pool.get_pg_num ());
8441+ }
8442+ }
8443+ }
8444+
83888445void OSD::_committed_osd_maps (epoch_t first, epoch_t last, MOSDMap *m)
83898446{
83908447 dout (10 ) << __func__ << " " << first << " .." << last << dendl;
0 commit comments