
Commit 9a5d55c

osd: Restrict choice of primary shard for ec_optimizations pools
Pools with ec_optimizations enabled have restrictions on which shards are
permitted to become the primary, because not all shards are updated on every
I/O.

To preserve backwards compatibility with downlevel clients, pg_temp is used
as the method to override the selection of primary by OSDMap. Directly
changing the logic in OSDMap would have meant that all clients needed to be
upgraded to Tentacle before using optimized EC pools, so that option was
discounted. Using primary_temp to set the primary for an EC pool is not
reliable, because under error conditions an OSD can store multiple shards
for the same PG and primary_temp cannot define which of these shards will
be chosen.

For optimized EC pools, pg_temp is shuffled so that the non-primary shards
are listed last. This means that the existing logic in OSDMap that picks the
first available shard as the primary will avoid selecting a non-primary
shard. OSDMonitor applies the shuffle when pg_temp is set; this is then
reverted in PeeringState when initializing the acting set, after OSDMap has
selected the primary.

PeeringState::choose_acting is modified to set pg_temp if OSDMap has
selected a non-primary shard; this causes a new OSDMap to be published,
which persuades OSDMap to select a primary shard instead.

Signed-off-by: Bill Scales <[email protected]>
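As a worked illustration of the shuffle described above, here is a minimal
standalone sketch (not Ceph code: CRUSH_ITEM_NONE is stood in by -1 and the
non-primary shard test is simplified to a set lookup), using the 4+2 example
that appears in the new OSDMap.cc comment below:

#include <cassert>
#include <set>
#include <vector>

// Hypothetical stand-in for pg_pool_t::is_nonprimary_shard().
static bool is_nonprimary_shard(const std::set<int>& nonprimary, int shard) {
  return nonprimary.count(shard) != 0;
}

// Reorder pg_temp so that shards permitted to be the primary come first,
// mirroring the new OSDMap::pgtemp_primaryfirst.
static std::vector<int> primaryfirst(const std::set<int>& nonprimary,
                                     const std::vector<int>& pg_temp) {
  std::vector<int> result, tail;
  int shard = 0;
  for (int osd : pg_temp) {
    (is_nonprimary_shard(nonprimary, shard++) ? tail : result).push_back(osd);
  }
  result.insert(result.end(), tail.begin(), tail.end());
  return result;
}

int main() {
  const int NONE = -1;  // stand-in for CRUSH_ITEM_NONE
  // Optimized EC pool 4+2: shards 1-3 may not become the primary.
  const std::set<int> nonprimary{1, 2, 3};
  const std::vector<int> pg_temp{NONE, 6, 7, 8, 9, 10};
  // OSDs 6, 7, 8 (shards 1-3) move to the back; the first available
  // entry, OSD 9, now belongs to a shard that is allowed to be primary.
  assert(primaryfirst(nonprimary, pg_temp) ==
         (std::vector<int>{NONE, 9, 10, 6, 7, 8}));
  return 0;
}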
1 parent 5307f7a commit 9a5d55c

File tree

4 files changed: +106 -6 lines changed

src/mon/OSDMonitor.cc

Lines changed: 11 additions & 1 deletion
@@ -4187,8 +4187,18 @@ bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
               << ": pool has been removed" << dendl;
       continue;
     }
+    // Pools with allow_ec_optimizations set store pg_temp in a different
+    // order to change the primary selection algorithm without breaking
+    // old clients. If necessary re-order the new pg_temp now
+    pg_pool_t pg_pool;
+    if (pending_inc.new_pools.count(pool))
+      pg_pool = pending_inc.new_pools[pool];
+    else
+      pg_pool = *osdmap.get_pg_pool(pool);
+
+    std::vector<int> pg_temp = osdmap.pgtemp_primaryfirst(pg_pool, p->second);
     pending_inc.new_pg_temp[p->first] =
-      mempool::osdmap::vector<int>(p->second.begin(), p->second.end());
+      mempool::osdmap::vector<int>(pg_temp.begin(), pg_temp.end());

     // unconditionally clear pg_primary (until this message can encode
     // a change for that, too.. at which point we need to also fix
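A brief sketch of the pending-overlay lookup used in this hunk: a pool
created or modified in the same pending increment must win over the
committed OSDMap, otherwise a pg_temp proposed in the same epoch as the
pool change would be reordered against stale pool flags. PoolInfo and
lookup_pool are simplified, hypothetical stand-ins, not Ceph API:

#include <cstdint>
#include <map>

// Hypothetical stand-in for pg_pool_t; only the flag we care about.
struct PoolInfo { bool ec_optimized = false; };

// Prefer the uncommitted pool definition from the pending increment,
// falling back to the committed map -- the pattern used in
// OSDMonitor::prepare_pgtemp above.
PoolInfo lookup_pool(const std::map<int64_t, PoolInfo>& pending_new_pools,
                     const std::map<int64_t, PoolInfo>& committed_pools,
                     int64_t pool) {
  if (auto it = pending_new_pools.find(pool); it != pending_new_pools.end())
    return it->second;  // change proposed in this epoch wins
  return committed_pools.at(pool);
}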

src/osd/OSDMap.cc

Lines changed: 67 additions & 0 deletions
@@ -2852,6 +2852,73 @@ void OSDMap::_apply_primary_affinity(ps_t seed,
   }
 }

+/* EC pools with allow_ec_optimizations set have some shards that cannot
+ * become the primary because they are not updated on every I/O. To avoid
+ * requiring clients to be upgraded to use these new pools the logic in
+ * OSDMap which selects a primary cannot be changed. Instead choose_acting
+ * is modified to set pgtemp when it is necessary to override the choice
+ * of primary, and this vector is reordered so that shards that are
+ * permitted to be the primary are listed first. The existing OSDMap code
+ * will then choose a suitable shard as primary except when the pg is
+ * incomplete and the choice of primary doesn't matter. This function is
+ * called by OSDMonitor when setting pg_temp to transform the vector.
+ *
+ * Example: Optimized EC pool 4+2
+ * acting_set = {NONE, 6, 7, 8, 9, 10}
+ * non_primary_shards = {1, 2, 3} # data shards other than shard 0
+ * pg_temp = {NONE, 9, 10, 6, 7, 8} # non-primary shards at end
+ * primary will be OSD 9(1)
+ */
+const std::vector<int> OSDMap::pgtemp_primaryfirst(const pg_pool_t& pool,
+    const std::vector<int>& pg_temp) const
+{
+  // Only perform the transform for pools with allow_ec_optimizations set
+  if (pool.allows_ecoptimizations()) {
+    std::vector<int> result;
+    std::vector<int> nonprimary;
+    int shard = 0;
+    for (auto osd : pg_temp) {
+      if (pool.is_nonprimary_shard(shard_id_t(shard))) {
+        nonprimary.emplace_back(osd);
+      } else {
+        result.emplace_back(osd);
+      }
+      shard++;
+    }
+    result.insert(result.end(), nonprimary.begin(), nonprimary.end());
+    return result;
+  }
+  return pg_temp;
+}
+
+/* The function above reorders the pg_temp vector. This transformation needs
+ * to be reversed by OSDs (but not clients) and is called by PeeringState
+ * when initializing the acting set.
+ */
+const std::vector<int> OSDMap::pgtemp_undo_primaryfirst(const pg_pool_t& pool,
+    const pg_t pg, const std::vector<int>& acting) const
+{
+  // Only perform the transform for pools with allow_ec_optimizations set
+  // that also have pg_temp set
+  if (pool.allows_ecoptimizations()) {
+    if (pg_temp->find(pool.raw_pg_to_pg(pg)) != pg_temp->end()) {
+      std::vector<int> result;
+      int primaryshard = 0;
+      int nonprimaryshard = pool.size - pool.nonprimary_shards.size();
+      assert(acting.size() == pool.size);
+      for (auto shard = 0; shard < pool.size; shard++) {
+        if (pool.is_nonprimary_shard(shard_id_t(shard))) {
+          result.emplace_back(acting[nonprimaryshard++]);
+        } else {
+          result.emplace_back(acting[primaryshard++]);
+        }
+      }
+      return result;
+    }
+  }
+  return acting;
+}
+
 void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
                             vector<int> *temp_pg, int *temp_primary) const
 {
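A hedged, standalone sketch of the inverse transform with simplified types
(not the OSDMap API): primary-capable shards are consumed from the front of
the reordered vector and non-primary shards from a cursor starting at
pool_size - |non-primary shards|, which restores shard order. Round-tripping
the 4+2 example from the comment above recovers the original layout:

#include <cassert>
#include <cstddef>
#include <set>
#include <vector>

// Undo the primary-first reorder, mirroring OSDMap::pgtemp_undo_primaryfirst
// with simplified types. `size` is the pool size (k+m).
static std::vector<int> undo_primaryfirst(const std::set<int>& nonprimary,
                                          std::size_t size,
                                          const std::vector<int>& acting) {
  assert(acting.size() == size);
  std::vector<int> result;
  std::size_t primary_at = 0;                             // cursor over leading entries
  std::size_t nonprimary_at = size - nonprimary.size();   // cursor over trailing entries
  for (int shard = 0; shard < static_cast<int>(size); ++shard) {
    result.push_back(nonprimary.count(shard) ? acting[nonprimary_at++]
                                             : acting[primary_at++]);
  }
  return result;
}

int main() {
  const int NONE = -1;  // stand-in for CRUSH_ITEM_NONE
  const std::set<int> nonprimary{1, 2, 3};  // 4+2 pool, shards 1-3 non-primary
  const std::vector<int> reordered{NONE, 9, 10, 6, 7, 8};
  // Shards 0, 4 and 5 are read from the front of the reordered vector,
  // shards 1-3 from the tail, restoring the original shard order.
  assert(undo_primaryfirst(nonprimary, 6, reordered) ==
         (std::vector<int>{NONE, 6, 7, 8, 9, 10}));
  return 0;
}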

src/osd/OSDMap.h

Lines changed: 7 additions & 0 deletions
@@ -590,6 +590,7 @@ class OSDMap {

   mempool::osdmap::vector<__u32> osd_weight;   // 16.16 fixed point, 0x10000 = "in", 0 = "out"
   mempool::osdmap::vector<osd_info_t> osd_info;
+  // Optimized EC pools re-order pg_temp, see pgtemp_primaryfirst
   std::shared_ptr<PGTempMap> pg_temp;  // temp pg mapping (e.g. while we rebuild)
   std::shared_ptr< mempool::osdmap::map<pg_t,int32_t > > primary_temp;  // temp primary mapping (e.g. while we rebuild)
   std::shared_ptr< mempool::osdmap::vector<__u32> > osd_primary_affinity; ///< 16.16 fixed point, 0x10000 = baseline
@@ -1357,6 +1358,12 @@ class OSDMap {
     return false;
   }

+  const std::vector<int> pgtemp_primaryfirst(const pg_pool_t& pool,
+                                             const std::vector<int>& pg_temp) const;
+  const std::vector<int> pgtemp_undo_primaryfirst(const pg_pool_t& pool,
+                                                  const pg_t pg,
+                                                  const std::vector<int>& acting) const;
+
   bool in_removed_snaps_queue(int64_t pool, snapid_t snap) const {
     auto p = removed_snaps_queue.find(pool);
     if (p == removed_snaps_queue.end()) {

src/osd/PeeringState.cc

Lines changed: 21 additions & 5 deletions
@@ -538,10 +538,14 @@ bool PeeringState::should_restart_peering(
   int newupprimary,
   int newactingprimary,
   const vector<int>& newup,
-  const vector<int>& newacting,
+  const vector<int>& _newacting,
   OSDMapRef lastmap,
   OSDMapRef osdmap)
 {
+  const vector<int> newacting = osdmap->pgtemp_undo_primaryfirst(
+    pool.info,
+    info.pgid.pgid,
+    _newacting);
   if (PastIntervals::is_new_interval(
       primary.osd,
       newactingprimary,
@@ -820,7 +824,9 @@ void PeeringState::init_primary_up_acting(
   int new_acting_primary)
 {
   actingset.clear();
-  acting = newacting;
+  acting = get_osdmap()->pgtemp_undo_primaryfirst(pool.info,
+                                                  info.pgid.pgid,
+                                                  newacting);
   for (uint8_t i = 0; i < acting.size(); ++i) {
     if (acting[i] != CRUSH_ITEM_NONE)
       actingset.insert(
@@ -2445,13 +2451,23 @@ bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
               << " from oversized want " << want << dendl;
     want.pop_back();
   }
-  if (want != acting) {
-    psdout(10) << "want " << want << " != acting " << acting
+  if ((want != acting) ||
+      pool.info.is_nonprimary_shard(pg_whoami.shard)) {
+    if (pool.info.is_nonprimary_shard(pg_whoami.shard)) {
+      psdout(10) << "shard " << pg_whoami.shard << " cannot be primary, want "
+                 << pg_vector_string(want)
+                 << " acting " << pg_vector_string(acting)
                  << ", requesting pg_temp change" << dendl;
+    } else {
+      psdout(10) << "want " << pg_vector_string(want)
+                 << " != acting " << pg_vector_string(acting)
+                 << ", requesting pg_temp change" << dendl;
+    }
     want_acting = want;

     if (!cct->_conf->osd_debug_no_acting_change) {
-      if (want_acting == up) {
+      if ((want_acting == up) &&
+          !pool.info.is_nonprimary_shard(pg_whoami.shard)) {
        // There can't be any pending backfill if
        // want is the same as crush map up OSDs.
        ceph_assert(want_backfill.empty());
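The effect of the choose_acting change can be summarized with a small
predicate sketch, based on the surrounding logic in the diff: a pg_temp
change is now requested not only when want differs from acting, but also
whenever the shard this OSD holds is one the pool forbids from being
primary, and the "want matches up, so clear pg_temp" shortcut is skipped in
that case. Names here are hypothetical stand-ins, not the PeeringState API:

#include <vector>

// Sketch of the pg_temp request that choose_acting now queues.
std::vector<int> pg_temp_to_request(const std::vector<int>& want_acting,
                                    const std::vector<int>& up,
                                    bool my_shard_is_nonprimary) {
  if (want_acting == up && !my_shard_is_nonprimary) {
    // want matches the CRUSH up set and this shard may be primary:
    // request an empty pg_temp, i.e. remove any existing override.
    return {};
  }
  // Otherwise publish an explicit pg_temp. For a non-primary shard this
  // branch is now taken even when want_acting == up, which is what forces
  // a new OSDMap epoch in which a primary-capable shard is selected.
  return want_acting;
}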
