Skip to content

Commit 25fb8aa

Browse files
authored
Merge pull request ceph#54988 from Matan-B/wip-matanb-mapgap-fix
osd/OSD: rewrite track_pools_and_pg_num_changes logic Reviewed-by: Samuel Just <[email protected]>
2 parents 51fd35d + 78d7668 commit 25fb8aa

File tree

2 files changed

+151
-82
lines changed

2 files changed

+151
-82
lines changed

src/osd/OSD.cc

Lines changed: 139 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -348,15 +348,16 @@ void OSDService::identify_splits_and_merges(
348348
return;
349349
}
350350
int old_pgnum = old_map->get_pg_num(pgid.pool());
351-
auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
352-
if (p == osd->pg_num_history.pg_nums.end()) {
351+
if (!osd->pg_num_history.pg_nums.contains(pgid.pool())) {
353352
dout(20) << __func__ << " " << pgid << " pool " << pgid.pool()
354353
<< " has no history" << dendl;
355354
return;
356355
}
356+
// The pgid's pool [epoch -> pg_num] map
357+
const auto& pool_pg_num_history_map = osd->pg_num_history.pg_nums[pgid.pool()];
357358
dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
358359
<< " to e" << new_map->get_epoch()
359-
<< " pg_nums " << p->second << dendl;
360+
<< " pg_nums " << pool_pg_num_history_map << dendl;
360361
deque<spg_t> queue;
361362
queue.push_back(pgid);
362363
set<spg_t> did;
@@ -365,83 +366,87 @@ void OSDService::identify_splits_and_merges(
365366
queue.pop_front();
366367
did.insert(cur);
367368
unsigned pgnum = old_pgnum;
368-
for (auto q = p->second.lower_bound(old_map->get_epoch());
369-
q != p->second.end() &&
370-
q->first <= new_map->get_epoch();
371-
++q) {
372-
if (pgnum < q->second) {
369+
for (auto map_iter = pool_pg_num_history_map.lower_bound(old_map->get_epoch());
370+
map_iter != pool_pg_num_history_map.end();
371+
++map_iter) {
372+
const auto& [new_epoch, new_pgnum] = *map_iter;
373+
if (new_epoch > new_map->get_epoch()) {
374+
// don't handle any changes recorded later than new_map's epoch
375+
break;
376+
}
377+
if (pgnum < new_pgnum) {
373378
// split?
374379
if (cur.ps() < pgnum) {
375380
set<spg_t> children;
376-
if (cur.is_split(pgnum, q->second, &children)) {
377-
dout(20) << __func__ << " " << cur << " e" << q->first
378-
<< " pg_num " << pgnum << " -> " << q->second
381+
if (cur.is_split(pgnum, new_pgnum, &children)) {
382+
dout(20) << __func__ << " " << cur << " e" << new_epoch
383+
<< " pg_num " << pgnum << " -> " << new_pgnum
379384
<< " children " << children << dendl;
380385
for (auto i : children) {
381-
split_children->insert(make_pair(i, q->first));
386+
split_children->insert(make_pair(i, new_epoch));
382387
if (!did.count(i))
383388
queue.push_back(i);
384389
}
385390
}
386-
} else if (cur.ps() < q->second) {
387-
dout(20) << __func__ << " " << cur << " e" << q->first
388-
<< " pg_num " << pgnum << " -> " << q->second
391+
} else if (cur.ps() < new_pgnum) {
392+
dout(20) << __func__ << " " << cur << " e" << new_epoch
393+
<< " pg_num " << pgnum << " -> " << new_pgnum
389394
<< " is a child" << dendl;
390395
// normally we'd capture this from the parent, but it's
391396
// possible the parent doesn't exist yet (it will be
392397
// fabricated to allow an intervening merge). note this PG
393398
// as a split child here to be sure we catch it.
394-
split_children->insert(make_pair(cur, q->first));
399+
split_children->insert(make_pair(cur, new_epoch));
395400
} else {
396-
dout(20) << __func__ << " " << cur << " e" << q->first
397-
<< " pg_num " << pgnum << " -> " << q->second
401+
dout(20) << __func__ << " " << cur << " e" << new_epoch
402+
<< " pg_num " << pgnum << " -> " << new_pgnum
398403
<< " is post-split, skipping" << dendl;
399404
}
400405
} else if (merge_pgs) {
401406
// merge?
402-
if (cur.ps() >= q->second) {
407+
if (cur.ps() >= new_pgnum) {
403408
if (cur.ps() < pgnum) {
404409
spg_t parent;
405-
if (cur.is_merge_source(pgnum, q->second, &parent)) {
410+
if (cur.is_merge_source(pgnum, new_pgnum, &parent)) {
406411
set<spg_t> children;
407-
parent.is_split(q->second, pgnum, &children);
408-
dout(20) << __func__ << " " << cur << " e" << q->first
409-
<< " pg_num " << pgnum << " -> " << q->second
412+
parent.is_split(new_pgnum, pgnum, &children);
413+
dout(20) << __func__ << " " << cur << " e" << new_epoch
414+
<< " pg_num " << pgnum << " -> " << new_pgnum
410415
<< " is merge source, target " << parent
411416
<< ", source(s) " << children << dendl;
412-
merge_pgs->insert(make_pair(parent, q->first));
417+
merge_pgs->insert(make_pair(parent, new_epoch));
413418
if (!did.count(parent)) {
414419
// queue (and re-scan) parent in case it might not exist yet
415420
// and there are some future splits pending on it
416421
queue.push_back(parent);
417422
}
418423
for (auto c : children) {
419-
merge_pgs->insert(make_pair(c, q->first));
424+
merge_pgs->insert(make_pair(c, new_epoch));
420425
if (!did.count(c))
421426
queue.push_back(c);
422427
}
423428
}
424429
} else {
425-
dout(20) << __func__ << " " << cur << " e" << q->first
426-
<< " pg_num " << pgnum << " -> " << q->second
430+
dout(20) << __func__ << " " << cur << " e" << new_epoch
431+
<< " pg_num " << pgnum << " -> " << new_pgnum
427432
<< " is beyond old pgnum, skipping" << dendl;
428433
}
429434
} else {
430435
set<spg_t> children;
431-
if (cur.is_split(q->second, pgnum, &children)) {
432-
dout(20) << __func__ << " " << cur << " e" << q->first
433-
<< " pg_num " << pgnum << " -> " << q->second
436+
if (cur.is_split(new_pgnum, pgnum, &children)) {
437+
dout(20) << __func__ << " " << cur << " e" << new_epoch
438+
<< " pg_num " << pgnum << " -> " << new_pgnum
434439
<< " is merge target, source " << children << dendl;
435440
for (auto c : children) {
436-
merge_pgs->insert(make_pair(c, q->first));
441+
merge_pgs->insert(make_pair(c, new_epoch));
437442
if (!did.count(c))
438443
queue.push_back(c);
439444
}
440-
merge_pgs->insert(make_pair(cur, q->first));
445+
merge_pgs->insert(make_pair(cur, new_epoch));
441446
}
442447
}
443448
}
444-
pgnum = q->second;
449+
pgnum = new_pgnum;
445450
}
446451
}
447452
}
@@ -8277,13 +8282,18 @@ void OSD::handle_osd_map(MOSDMap *m)
82778282
rerequest_full_maps();
82788283
}
82798284

8285+
track_pools_and_pg_num_changes(added_maps, t);
8286+
82808287
if (!superblock.maps.empty()) {
82818288
trim_maps(m->cluster_osdmap_trim_lower_bound);
82828289
pg_num_history.prune(superblock.get_oldest_map());
82838290
}
82848291
superblock.insert_osdmap_epochs(first, last);
82858292
if (superblock.maps.num_intervals() > 1) {
8286-
dout(10) << __func__ << " osd map gap " << superblock.maps << dendl;
8293+
// we had a map gap and not yet trimmed all the way up to
8294+
// cluster_osdmap_trim_lower_bound
8295+
dout(10) << __func__ << " osd maps are not contiguous"
8296+
<< superblock.maps << dendl;
82878297
}
82888298
superblock.current_epoch = last;
82898299

@@ -8294,54 +8304,6 @@ void OSD::handle_osd_map(MOSDMap *m)
82948304
superblock.clean_thru = last;
82958305
}
82968306

8297-
// check for pg_num changes and deleted pools
8298-
OSDMapRef lastmap;
8299-
for (auto& i : added_maps) {
8300-
if (!lastmap) {
8301-
if (!(lastmap = service.try_get_map(i.first - 1))) {
8302-
dout(10) << __func__ << " can't get previous map " << i.first - 1
8303-
<< " probably first start of this osd" << dendl;
8304-
continue;
8305-
}
8306-
}
8307-
ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
8308-
for (auto& j : lastmap->get_pools()) {
8309-
if (!i.second->have_pg_pool(j.first)) {
8310-
pg_num_history.log_pool_delete(i.first, j.first);
8311-
dout(10) << __func__ << " recording final pg_pool_t for pool "
8312-
<< j.first << dendl;
8313-
// this information is needed by _make_pg() if have to restart before
8314-
// the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8315-
ghobject_t obj = make_final_pool_info_oid(j.first);
8316-
bufferlist bl;
8317-
encode(j.second, bl, CEPH_FEATURES_ALL);
8318-
string name = lastmap->get_pool_name(j.first);
8319-
encode(name, bl);
8320-
map<string,string> profile;
8321-
if (lastmap->get_pg_pool(j.first)->is_erasure()) {
8322-
profile = lastmap->get_erasure_code_profile(
8323-
lastmap->get_pg_pool(j.first)->erasure_code_profile);
8324-
}
8325-
encode(profile, bl);
8326-
t.write(coll_t::meta(), obj, 0, bl.length(), bl);
8327-
} else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
8328-
new_pg_num != j.second.get_pg_num()) {
8329-
dout(10) << __func__ << " recording pool " << j.first << " pg_num "
8330-
<< j.second.get_pg_num() << " -> " << new_pg_num << dendl;
8331-
pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
8332-
}
8333-
}
8334-
for (auto& j : i.second->get_pools()) {
8335-
if (!lastmap->have_pg_pool(j.first)) {
8336-
dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
8337-
<< j.second.get_pg_num() << dendl;
8338-
pg_num_history.log_pg_num_change(i.first, j.first,
8339-
j.second.get_pg_num());
8340-
}
8341-
}
8342-
lastmap = i.second;
8343-
}
8344-
pg_num_history.epoch = last;
83458307
{
83468308
bufferlist bl;
83478309
::encode(pg_num_history, bl);
@@ -8385,6 +8347,101 @@ void OSD::handle_osd_map(MOSDMap *m)
83858347
service.publish_superblock(superblock);
83868348
}
83878349

8350+
/*
8351+
* Compare between the previous last_map we had to
8352+
* each one of the added_maps.
8353+
* Track all of the changes relevant in pg_num_history.
8354+
*/
8355+
void OSD::track_pools_and_pg_num_changes(
8356+
const map<epoch_t,OSDMapRef>& added_maps,
8357+
ObjectStore::Transaction& t)
8358+
{
8359+
epoch_t first = added_maps.begin()->first;
8360+
epoch_t last = added_maps.rbegin()->first;
8361+
8362+
// Unless this is the first start of this OSD,
8363+
// lastmap should be the newest_map we have.
8364+
OSDMapRef lastmap;
8365+
8366+
if (superblock.maps.empty()) {
8367+
dout(10) << __func__ << " no maps stored, this is probably "
8368+
<< "the first start of this osd" << dendl;
8369+
lastmap = added_maps.at(first);
8370+
} else {
8371+
if (first > superblock.get_newest_map() + 1) {
8372+
ceph_assert(first == superblock.cluster_osdmap_trim_lower_bound);
8373+
dout(20) << __func__ << " can't get previous map "
8374+
<< superblock.get_newest_map()
8375+
<< " first start of this osd after a map gap" << dendl;
8376+
}
8377+
if (!(lastmap =
8378+
service.try_get_map(superblock.get_newest_map()))) {
8379+
// This is unexpected
8380+
ceph_abort();
8381+
}
8382+
}
8383+
8384+
// For each added map, record any changes into pg_num_history
8385+
// and update lastmap afterwards.
8386+
for (auto& [current_added_map_epoch, current_added_map] : added_maps) {
8387+
_track_pools_and_pg_num_changes(t, lastmap,
8388+
current_added_map,
8389+
current_added_map_epoch);
8390+
lastmap = current_added_map;
8391+
}
8392+
pg_num_history.epoch = last;
8393+
}
8394+
8395+
void OSD::_track_pools_and_pg_num_changes(
8396+
ObjectStore::Transaction& t,
8397+
const OSDMapRef& lastmap,
8398+
const OSDMapRef& current_added_map,
8399+
epoch_t current_added_map_epoch)
8400+
{
8401+
// 1) Check if a pool was deleted
8402+
for (auto& [pool_id, pg_pool] : lastmap->get_pools()) {
8403+
if (!current_added_map->have_pg_pool(pool_id)) {
8404+
pg_num_history.log_pool_delete(current_added_map_epoch, pool_id);
8405+
dout(10) << __func__ << " recording final pg_pool_t for pool "
8406+
<< pool_id << dendl;
8407+
// this information is needed by _make_pg() if have to restart before
8408+
// the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8409+
ghobject_t obj = make_final_pool_info_oid(pool_id);
8410+
bufferlist bl;
8411+
encode(pg_pool, bl, CEPH_FEATURES_ALL);
8412+
string name = lastmap->get_pool_name(pool_id);
8413+
encode(name, bl);
8414+
map<string,string> profile;
8415+
if (lastmap->get_pg_pool(pool_id)->is_erasure()) {
8416+
profile = lastmap->get_erasure_code_profile(
8417+
lastmap->get_pg_pool(pool_id)->erasure_code_profile);
8418+
}
8419+
encode(profile, bl);
8420+
t.write(coll_t::meta(), obj, 0, bl.length(), bl);
8421+
8422+
// 2) For existing pools, check if pg_num was changed
8423+
} else if (unsigned new_pg_num = current_added_map->get_pg_num(pool_id);
8424+
new_pg_num != pg_pool.get_pg_num()) {
8425+
dout(10) << __func__ << " recording pool " << pool_id << " pg_num "
8426+
<< pg_pool.get_pg_num() << " -> " << new_pg_num << dendl;
8427+
pg_num_history.log_pg_num_change(current_added_map_epoch,
8428+
pool_id,
8429+
new_pg_num);
8430+
}
8431+
}
8432+
8433+
// 3) Check if a pool was created
8434+
for (auto& [pool_id, pg_pool] : current_added_map->get_pools()) {
8435+
if (!lastmap->have_pg_pool(pool_id)) {
8436+
dout(10) << __func__ << " recording new pool " <<pool_id << " pg_num "
8437+
<< pg_pool.get_pg_num() << dendl;
8438+
pg_num_history.log_pg_num_change(current_added_map_epoch,
8439+
pool_id,
8440+
pg_pool.get_pg_num());
8441+
}
8442+
}
8443+
}
8444+
83888445
void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
83898446
{
83908447
dout(10) << __func__ << " " << first << ".." << last << dendl;

src/osd/OSD.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1682,6 +1682,12 @@ class OSD : public Dispatcher,
16821682
return osdmap ? osdmap->get_epoch() : 0;
16831683
}
16841684

1685+
/* When handling OSDMaps pg_num_history is used to
1686+
* track any changes to number of PGs of each pool
1687+
* to be used later in order to identify PG splits and merges.
1688+
* See: OSD::track_pools_and_pg_num_changes
1689+
* and OSDService::identify_splits_and_merges.
1690+
*/
16851691
pool_pg_num_history_t pg_num_history;
16861692

16871693
ceph::shared_mutex map_lock = ceph::make_shared_mutex("OSD::map_lock");
@@ -1690,6 +1696,12 @@ class OSD : public Dispatcher,
16901696
friend struct send_map_on_destruct;
16911697

16921698
void handle_osd_map(class MOSDMap *m);
1699+
void track_pools_and_pg_num_changes(const std::map<epoch_t,OSDMapRef>& added_maps,
1700+
ObjectStore::Transaction& t);
1701+
void _track_pools_and_pg_num_changes(ObjectStore::Transaction& t,
1702+
const OSDMapRef& lastmap,
1703+
const OSDMapRef& current_added_map,
1704+
epoch_t current_added_map_epoch);
16931705
void _committed_osd_maps(epoch_t first, epoch_t last, class MOSDMap *m);
16941706
void trim_maps(epoch_t oldest);
16951707
void note_down_osd(int osd);

0 commit comments

Comments
 (0)