
Commit f3f45c2

bill-scales authored and aainscow committed
osd: Optimized EC choose_acting needs to use best primary shard
There have been a couple of corner case bugs with choose_acting on optimized EC pools in the scenario where a new primary with no existing log is chosen and find_best_info selects a non-primary shard as the authoritative shard. Non-primary shards do not have a full log, so in this scenario we need to get the log from a shard that does have a complete log first (so that our log is ahead of or equivalent to the authoritative shard) and then repeat the get log for the authoritative shard. Problems arise if we make different decisions about the acting set and backfill/recovery based on these two different shards.

In one bug we oscillated between two different primaries because one primary used one shard to make peering decisions and the other primary used the other shard, resulting in looping flip/flop changes to the acting_set. In another bug we used one shard to decide that we could do async recovery but then tried to get the log from another shard and asserted because we did not have enough history in the log to do recovery and should have chosen to do a backfill.

This change makes optimized EC pools always choose the best !non_primary shard when making decisions about peering (irrespective of whether the primary has a full log or not). The best overall shard is now only used for get log when deciding how far to roll back the log. It also sets repeat_getlog to false if peering fails because the PG is incomplete, to avoid looping forever trying to get the log.

Signed-off-by: Bill Scales <[email protected]>
1 parent cffd10f commit f3f45c2
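
The selection flow described in the commit message can be summarised in a short, self-contained sketch. This is an illustration only, not the committed Ceph code: ShardInfo, choose_shards and the find_best_info stand-in below are simplified assumptions that mirror the structure of the diff further down (the real code works on all_info iterators, eversion_t and pool.info.is_nonprimary_shard()).

#include <optional>
#include <string>
#include <vector>

// Simplified stand-in for the per-shard info the real code inspects.
struct ShardInfo {
  std::string shard;     // which shard this is
  unsigned last_update;  // how far its log reaches (stand-in for eversion_t)
  bool nonprimary;       // non-primary shards only keep a sparse (partial) log
};

// Stand-in for find_best_info(): pick the most advanced shard,
// optionally ignoring non-primary shards.
static std::optional<ShardInfo>
find_best_info(const std::vector<ShardInfo>& all, bool exclude_nonprimary) {
  std::optional<ShardInfo> best;
  for (const auto& s : all) {
    if (exclude_nonprimary && s.nonprimary)
      continue;
    if (!best || s.last_update > best->last_update)
      best = s;
  }
  return best;
}

struct ActingChoice {
  ShardInfo auth_log_shard;   // best !non_primary shard: drives all peering decisions
  ShardInfo get_log_shard;    // best shard overall: only used to decide the rollback point
  bool repeat_getlog = false; // true => fetch a complete log first, then redo this step
};

// Always base peering decisions on the best !non_primary shard; if the best
// overall shard is a non-primary shard that is ahead of us, get a complete
// log from the !non_primary shard first and repeat.
static std::optional<ActingChoice>
choose_shards(const std::vector<ShardInfo>& all, unsigned my_last_update) {
  auto auth = find_best_info(all, /*exclude_nonprimary=*/true);
  auto get_log = find_best_info(all, /*exclude_nonprimary=*/false);
  if (!auth || !get_log)
    return std::nullopt;  // no suitable info; the caller reverts to the up set

  ActingChoice c{*auth, *get_log};
  if (my_last_update < get_log->last_update && get_log->nonprimary) {
    // The best shard has a sparse log: pull the complete log from the
    // best !non_primary shard first, then repeat to work out the rollback.
    c.get_log_shard = *auth;
    c.repeat_getlog = true;
  }
  return c;
}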

File tree

2 files changed: +26 −24 lines changed


src/osd/PeeringState.cc

Lines changed: 25 additions & 23 deletions
@@ -2490,7 +2490,7 @@ void PeeringState::choose_async_recovery_replicated(
  * 3) remove the assertion in PG::PeeringState::Active::react(const AdvMap)
  * TODO!
  */
-bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
+bool PeeringState::choose_acting(pg_shard_t &get_log_shard_id,
 				 bool restrict_to_up_acting,
 				 bool request_pg_temp_change_only,
 				 bool *history_les_bound,
@@ -2507,31 +2507,32 @@ bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
   }
 
   auto auth_log_shard = find_best_info(all_info, restrict_to_up_acting,
-				       false, history_les_bound);
-  auto get_log_shard = auth_log_shard;
+				       true, history_les_bound);
+  auto get_log_shard = find_best_info(all_info, restrict_to_up_acting,
+				      false, history_les_bound);
 
   if ((repeat_getlog != nullptr) &&
-      auth_log_shard != all_info.end() &&
-      (info.last_update < auth_log_shard->second.last_update) &&
-      pool.info.is_nonprimary_shard(auth_log_shard->first.shard)) {
+      get_log_shard != all_info.end() &&
+      (info.last_update < get_log_shard->second.last_update) &&
+      pool.info.is_nonprimary_shard(get_log_shard->first.shard)) {
     // Only EC pools with ec_optimizations enabled:
-    // Our log is behind that of the auth_log_shard which is a
+    // Our log is behind that of the get_log_shard which is a
     // non-primary shard and hence may have a sparse log,
-    // get a complete log from a primary shard first then
+    // get a complete log from the auth_log_shard first then
     // repeat this step in the state machine to work out what
     // has to be rolled backwards
-    psdout(10) << "auth_log_shard " << auth_log_shard->first
+    psdout(10) << "get_log_shard " << get_log_shard->first
 	       << " is ahead but is a non_primary shard" << dendl;
-    get_log_shard = find_best_info(all_info, restrict_to_up_acting,
-				   true, history_les_bound);
-    if (get_log_shard != all_info.end()) {
+    if (auth_log_shard != all_info.end()) {
       psdout(10) << "auth_log_shard " << auth_log_shard->first
 		 << " selected instead" << dendl;
+      get_log_shard = auth_log_shard;
      *repeat_getlog = true;
     }
   }
 
-  if (get_log_shard == all_info.end()) {
+  if ((auth_log_shard == all_info.end()) ||
+      (get_log_shard == all_info.end())) {
     if (up != acting) {
       psdout(10) << "no suitable info found (incomplete backfills?),"
 		 << " reverting to up" << dendl;
@@ -2546,7 +2547,7 @@ bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
   }
 
   ceph_assert(!auth_log_shard->second.is_incomplete());
-  auth_log_shard_id = get_log_shard->first;
+  get_log_shard_id = get_log_shard->first;
 
   set<pg_shard_t> want_backfill, want_acting_backfill;
   vector<int> want;
@@ -6334,9 +6335,9 @@ PeeringState::Recovering::react(const RequestBackfill &evt)
     // so pg won't have to stay undersized for long
     // as backfill might take a long time to complete..
     if (!ps->async_recovery_targets.empty()) {
-      pg_shard_t auth_log_shard;
+      pg_shard_t get_log_shard;
       // FIXME: Uh-oh we have to check this return value; choose_acting can fail!
-      ps->choose_acting(auth_log_shard, true);
+      ps->choose_acting(get_log_shard, true);
     }
     return transit<WaitLocalBackfillReserved>();
   }
@@ -6391,7 +6392,7 @@ PeeringState::Recovered::Recovered(my_context ctx)
   : my_base(ctx),
     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovered")
 {
-  pg_shard_t auth_log_shard;
+  pg_shard_t get_log_shard;
 
   context< PeeringMachine >().log_enter(state_name);
 
@@ -6410,11 +6411,11 @@ PeeringState::Recovered::Recovered(my_context ctx)
 
   // adjust acting set? (e.g. because backfill completed...)
   if (ps->acting != ps->up &&
-      !ps->choose_acting(auth_log_shard, true)) {
+      !ps->choose_acting(get_log_shard, true)) {
     ceph_assert(ps->want_acting.size());
   } else if (!ps->async_recovery_targets.empty()) {
     // FIXME: Uh-oh we have to check this return value; choose_acting can fail!
-    ps->choose_acting(auth_log_shard, true);
+    ps->choose_acting(get_log_shard, true);
   }
 
   if (context< Active >().all_replicas_activated &&
@@ -6559,9 +6560,9 @@ boost::statechart::result PeeringState::Active::react(const AdvMap& advmap)
       // call choose_acting again to clear them out.
       // note that we leave restrict_to_up_acting to false in order to
      // not overkill any chosen stray that is still alive.
-      pg_shard_t auth_log_shard;
+      pg_shard_t get_log_shard;
       ps->remove_down_peer_info(advmap.osdmap);
-      ps->choose_acting(auth_log_shard, false, true);
+      ps->choose_acting(get_log_shard, false, true);
     }
 
     /* Check for changes in pool size (if the acting set changed as a result,
@@ -6639,9 +6640,9 @@ boost::statechart::result PeeringState::Active::react(const MNotifyRec& notevt)
     }
     // check if it is a previous down acting member that's coming back.
     // if so, request pg_temp change to trigger a new interval transition
-    pg_shard_t auth_log_shard;
+    pg_shard_t get_log_shard;
     // FIXME: Uh-oh we have to check this return value; choose_acting can fail!
-    ps->choose_acting(auth_log_shard, false, true);
+    ps->choose_acting(get_log_shard, false, true);
     if (!ps->want_acting.empty() && ps->want_acting != ps->acting) {
       psdout(10) << "Active: got notify from previous acting member "
 		 << notevt.from << ", requesting pg_temp change"
@@ -7563,6 +7564,7 @@ PeeringState::GetLog::GetLog(my_context ctx)
   // am i broken?
   if (ps->info.last_update < best.log_tail) {
     psdout(10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
+    repeat_getlog = false;
     post_event(IsIncomplete());
     return;
   }

src/osd/PeeringState.h

Lines changed: 1 addition & 1 deletion
@@ -1768,7 +1768,7 @@ class PeeringState : public MissingLoc::MappingInfo {
 		     const OSDMapRef osdmap) const;
 
   bool recoverable(const std::vector<int> &want) const;
-  bool choose_acting(pg_shard_t &auth_log_shard,
+  bool choose_acting(pg_shard_t &get_log_shard,
 		     bool restrict_to_up_acting,
 		     bool request_pg_temp_change_only = false,
 		     bool *history_les_bound = nullptr,
