Skip to content

Commit 0da9145

Browse files
authored
Merge pull request ceph#62490 from bill-scales/ec_select_primary
osd: Restrict choice of primary shard for ec_optimizations pools
2 parents 83890f9 + 7dc6012 commit 0da9145

File tree

5 files changed

+178
-6
lines changed

5 files changed

+178
-6
lines changed

src/mon/OSDMonitor.cc

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4187,8 +4187,18 @@ bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
41874187
<< ": pool has been removed" << dendl;
41884188
continue;
41894189
}
4190+
// Pools with allow_ec_optimizations set store pg_temp in a different
4191+
// order to change the primary selection algorithm without breaking
4192+
// old clients. If necessary re-order the new pg_temp now
4193+
pg_pool_t pg_pool;
4194+
if (pending_inc.new_pools.count(pool))
4195+
pg_pool = pending_inc.new_pools[pool];
4196+
else
4197+
pg_pool = *osdmap.get_pg_pool(pool);
4198+
4199+
std::vector<int> pg_temp = osdmap.pgtemp_primaryfirst(pg_pool, p->second);
41904200
pending_inc.new_pg_temp[p->first] =
4191-
mempool::osdmap::vector<int>(p->second.begin(), p->second.end());
4201+
mempool::osdmap::vector<int>(pg_temp.begin(), pg_temp.end());
41924202

41934203
// unconditionally clear pg_primary (until this message can encode
41944204
// a change for that, too.. at which point we need to also fix

src/osd/OSDMap.cc

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2852,6 +2852,73 @@ void OSDMap::_apply_primary_affinity(ps_t seed,
28522852
}
28532853
}
28542854

2855+
/* EC pools with allow_ec_optimizations set have some shards that cannot
2856+
* become the primary because they are not updated on every I/O. To avoid
2857+
* requiring clients to be upgraded to use these new pools the logic in
2858+
* OSDMap which selects a primary cannot be changed. Instead choose_acting
2859+
* is modified to set pgtemp when it is necessary to override the choice
2860+
* of primary, and this vector is reordered so that shards that are
2861+
* permitted to be the primary are listed first. The existing OSDMap code
2862+
* will then choose a suitable shard as primary except when the pg is
2863+
* incomplete and the choice of primary doesn't matter. This function is
2864+
* called by OSDMonitor when setting pg_temp to transform the vector.
2865+
*
2866+
* Example: Optimized EC pool 4+2
2867+
* acting_set = {NONE, 6, 7, 8, 9, 10}
2868+
* non_primary_shards = {1, 2, 3} # data shards other than shard 0
2869+
* pg_temp = {NONE, 9, 10, 6, 7, 8} # non-primary shards at end
2870+
* primary will be OSD 9 (shard 4, listed at index 1 of pg_temp)
2871+
*/
2872+
const std::vector<int> OSDMap::pgtemp_primaryfirst(const pg_pool_t& pool,
2873+
const std::vector<int>& pg_temp) const
2874+
{
2875+
// Only perform the transform for pools with allow_ec_optimizations set
2876+
if (pool.allows_ecoptimizations()) {
2877+
std::vector<int> result;
2878+
std::vector<int> nonprimary;
2879+
int shard = 0;
2880+
for (auto osd : pg_temp) {
2881+
if (pool.is_nonprimary_shard(shard_id_t(shard))) {
2882+
nonprimary.emplace_back(osd);
2883+
} else {
2884+
result.emplace_back(osd);
2885+
}
2886+
shard++;
2887+
}
2888+
result.insert(result.end(), nonprimary.begin(), nonprimary.end());
2889+
return result;
2890+
}
2891+
return pg_temp;
2892+
}
2893+
2894+
/* The function above reorders the pg_temp vector. This transformation needs
2895+
* to be reversed by OSDs (but not clients) and is called by PeeringState
2896+
* when initializing the acting set.
2897+
*/
2898+
const std::vector<int> OSDMap::pgtemp_undo_primaryfirst(const pg_pool_t& pool,
2899+
const pg_t pg, const std::vector<int>& acting) const
2900+
{
2901+
// Only perform the transform for pools with allow_ec_optimizations set
2902+
// that also have pg_temp set
2903+
if (pool.allows_ecoptimizations()) {
2904+
if (pg_temp->find(pool.raw_pg_to_pg(pg)) != pg_temp->end()) {
2905+
std::vector<int> result;
2906+
int primaryshard = 0;
2907+
int nonprimaryshard = pool.size - pool.nonprimary_shards.size();
2908+
assert(acting.size() == pool.size);
2909+
for (auto shard = 0; shard < pool.size; shard++) {
2910+
if (pool.is_nonprimary_shard(shard_id_t(shard))) {
2911+
result.emplace_back(acting[nonprimaryshard++]);
2912+
} else {
2913+
result.emplace_back(acting[primaryshard++]);
2914+
}
2915+
}
2916+
return result;
2917+
}
2918+
}
2919+
return acting;
2920+
}
2921+
28552922
void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
28562923
vector<int> *temp_pg, int *temp_primary) const
28572924
{

src/osd/OSDMap.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -590,6 +590,7 @@ class OSDMap {
590590

591591
mempool::osdmap::vector<__u32> osd_weight; // 16.16 fixed point, 0x10000 = "in", 0 = "out"
592592
mempool::osdmap::vector<osd_info_t> osd_info;
593+
// Optimized EC pools re-order pg_temp, see pgtemp_primaryfirst
593594
std::shared_ptr<PGTempMap> pg_temp; // temp pg mapping (e.g. while we rebuild)
594595
std::shared_ptr< mempool::osdmap::map<pg_t,int32_t > > primary_temp; // temp primary mapping (e.g. while we rebuild)
595596
std::shared_ptr< mempool::osdmap::vector<__u32> > osd_primary_affinity; ///< 16.16 fixed point, 0x10000 = baseline
@@ -1357,6 +1358,12 @@ class OSDMap {
13571358
return false;
13581359
}
13591360

1361+
const std::vector<int> pgtemp_primaryfirst(const pg_pool_t& pool,
1362+
const std::vector<int>& pg_temp) const;
1363+
const std::vector<int> pgtemp_undo_primaryfirst(const pg_pool_t& pool,
1364+
const pg_t pg,
1365+
const std::vector<int>& acting) const;
1366+
13601367
bool in_removed_snaps_queue(int64_t pool, snapid_t snap) const {
13611368
auto p = removed_snaps_queue.find(pool);
13621369
if (p == removed_snaps_queue.end()) {

src/osd/PeeringState.cc

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -538,10 +538,14 @@ bool PeeringState::should_restart_peering(
538538
int newupprimary,
539539
int newactingprimary,
540540
const vector<int>& newup,
541-
const vector<int>& newacting,
541+
const vector<int>& _newacting,
542542
OSDMapRef lastmap,
543543
OSDMapRef osdmap)
544544
{
545+
const vector<int> newacting = osdmap->pgtemp_undo_primaryfirst(
546+
pool.info,
547+
info.pgid.pgid,
548+
_newacting);
545549
if (PastIntervals::is_new_interval(
546550
primary.osd,
547551
newactingprimary,
@@ -820,7 +824,9 @@ void PeeringState::init_primary_up_acting(
820824
int new_acting_primary)
821825
{
822826
actingset.clear();
823-
acting = newacting;
827+
acting = get_osdmap()->pgtemp_undo_primaryfirst(pool.info,
828+
info.pgid.pgid,
829+
newacting);
824830
for (uint8_t i = 0; i < acting.size(); ++i) {
825831
if (acting[i] != CRUSH_ITEM_NONE)
826832
actingset.insert(
@@ -2445,13 +2451,23 @@ bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
24452451
<< " from oversized want " << want << dendl;
24462452
want.pop_back();
24472453
}
2448-
if (want != acting) {
2449-
psdout(10) << "want " << want << " != acting " << acting
2454+
if ((want != acting) ||
2455+
pool.info.is_nonprimary_shard(pg_whoami.shard)) {
2456+
if (pool.info.is_nonprimary_shard(pg_whoami.shard)) {
2457+
psdout(10) << "shard " << pg_whoami.shard << " cannot be primary, want "
2458+
<< pg_vector_string(want)
2459+
<< " acting " << pg_vector_string(acting)
24502460
<< ", requesting pg_temp change" << dendl;
2461+
} else {
2462+
psdout(10) << "want " << pg_vector_string(want)
2463+
<< " != acting " << pg_vector_string(acting)
2464+
<< ", requesting pg_temp change" << dendl;
2465+
}
24512466
want_acting = want;
24522467

24532468
if (!cct->_conf->osd_debug_no_acting_change) {
2454-
if (want_acting == up) {
2469+
if ((want_acting == up) &&
2470+
!pool.info.is_nonprimary_shard(pg_whoami.shard)) {
24552471
// There can't be any pending backfill if
24562472
// want is the same as crush map up OSDs.
24572473
ceph_assert(want_backfill.empty());

src/test/osd/TestOSDMap.cc

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3016,6 +3016,78 @@ TEST_F(OSDMapTest, rb_osdsize_opt_score) {
30163016
return;
30173017
}
30183018

3019+
// Test pgtemp_primaryfirst and pgtemp_undo_primaryfirst transforms
3020+
TEST_F(OSDMapTest, pgtemp_primaryfirst) {
  set_up_map();

  // Six-shard pool. The OSD ids below equal their shard ids, which lets the
  // checks further down pass an OSD id taken from a reordered vector
  // straight to is_nonprimary_shard().
  pg_pool_t pool;
  pool.size = 6;

  vector<int> shard_order = { 0, 1, 2, 3, 4, 5 };
  vector<int> reordered;
  vector<int> restored;

  pg_t pgid = osdmap.raw_pg_to_pg(pg_t(0, my_ec_pool));

  // Case 1: EC optimizations off, PG has no pg_temp -> both calls are no-ops
  reordered = osdmap.pgtemp_primaryfirst(pool, shard_order);
  ASSERT_EQ(shard_order, reordered);
  restored = osdmap.pgtemp_undo_primaryfirst(pool, pgid, reordered);
  ASSERT_EQ(shard_order, restored);

  // Case 2: EC optimizations on, PG has no pg_temp -> still no change
  pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS);
  reordered = osdmap.pgtemp_primaryfirst(pool, shard_order);
  ASSERT_EQ(shard_order, reordered);
  restored = osdmap.pgtemp_undo_primaryfirst(pool, pgid, reordered);
  ASSERT_EQ(shard_order, restored);

  // Case 3: EC optimizations off, pg_temp installed for the PG
  pool.unset_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS);
  OSDMap::Incremental inc(osdmap.get_epoch() + 1);
  inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
    shard_order.begin(), shard_order.end());
  osdmap.apply_incremental(inc);

  reordered = osdmap.pgtemp_primaryfirst(pool, shard_order);
  ASSERT_EQ(shard_order, reordered);
  restored = osdmap.pgtemp_undo_primaryfirst(pool, pgid, reordered);
  ASSERT_EQ(shard_order, restored);

  // Case 4: EC optimizations on, pg_temp installed, but the pool has no
  // nonprimary_shards, so there is nothing to move
  pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS);
  reordered = osdmap.pgtemp_primaryfirst(pool, shard_order);
  ASSERT_EQ(shard_order, reordered);
  restored = osdmap.pgtemp_undo_primaryfirst(pool, pgid, reordered);
  ASSERT_EQ(shard_order, restored);

  // Case 5: every non-empty subset of the six shards marked non-primary.
  // Bit N of mask selects shard N.
  for (int mask = 1; mask < 64; mask++) {
    for (int shard = 0; shard < 6; shard++) {
      if (mask & (1 << shard)) {
        pool.nonprimary_shards.insert(shard_id_t(shard));
      } else {
        pool.nonprimary_shards.erase(shard_id_t(shard));
      }
    }
    ASSERT_TRUE(pool.nonprimary_shards.size() > 0);

    reordered = osdmap.pgtemp_primaryfirst(pool, shard_order);
    // Number of leading slots that must hold primary-capable shards.
    const auto primary_slots = pool.size - pool.nonprimary_shards.size();
    for (size_t pos = 0; pos < 6; pos++) {
      if (pos < primary_slots) {
        // front of the vector: primary-capable shards only
        ASSERT_FALSE(pool.is_nonprimary_shard(shard_id_t(reordered[pos])));
      } else {
        // tail of the vector: non-primary shards only
        ASSERT_TRUE(pool.is_nonprimary_shard(shard_id_t(reordered[pos])));
      }
    }

    // Undoing the transform must give back the original shard order.
    restored = osdmap.pgtemp_undo_primaryfirst(pool, pgid, reordered);
    ASSERT_EQ(shard_order, restored);
  }
}
3090+
30193091
INSTANTIATE_TEST_SUITE_P(
30203092
OSDMap,
30213093
OSDMapTest,

0 commit comments

Comments
 (0)