Skip to content

Commit d634f82

Browse files
bill-scalesaainscow
authored andcommitted
osd: Optimized EC don't apply pwlc for divergent writes
Split pwlc epoch into a separate variable so that we can use epoch and version number when comparing if last_update is within a pwlc range. This ensures that pwlc is not applied to a shard that has a divergent write, but still tracks the most recent update of pwlc. Signed-off-by: Bill Scales <[email protected]>
1 parent 880a17e commit d634f82

File tree

4 files changed

+77
-35
lines changed

4 files changed

+77
-35
lines changed

src/osd/PGBackend.cc

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -420,21 +420,27 @@ void PGBackend::partial_write(
420420
if (entry.written_shards.empty() && info->partial_writes_last_complete.empty()) {
421421
return;
422422
}
423+
const pg_pool_t &pool = get_parent()->get_pool();
424+
if (pool.is_nonprimary_shard(get_parent()->whoami_shard().shard)) {
425+
// Don't update pwlc on nonprimary shards because they only
426+
// observe writes that update their shard
427+
return;
428+
}
423429
auto dpp = get_parent()->get_dpp();
424430
ldpp_dout(dpp, 20) << __func__ << " version=" << entry.version
425431
<< " written_shards=" << entry.written_shards
426-
<< " pwlc=" << info->partial_writes_last_complete
432+
<< " pwlc=e" << info->partial_writes_last_complete_epoch
433+
<< ":" << info->partial_writes_last_complete
427434
<< " previous_version=" << previous_version
428435
<< dendl;
429-
const pg_pool_t &pool = get_parent()->get_pool();
430436
for (shard_id_t shard : pool.nonprimary_shards) {
431437
auto pwlc_iter = info->partial_writes_last_complete.find(shard);
432438
if (!entry.is_written_shard(shard)) {
433439
if (pwlc_iter == info->partial_writes_last_complete.end()) {
434440
// 1st partial write since all logs were updated
435441
info->partial_writes_last_complete[shard] =
436442
std::pair(previous_version, entry.version);
437-
443+
info->partial_writes_last_complete_epoch = get_osdmap_epoch();
438444
continue;
439445
}
440446
auto &&[old_v, new_v] = pwlc_iter->second;
@@ -444,7 +450,7 @@ void PGBackend::partial_write(
444450
// invalid
445451
ldpp_dout(dpp, 20) << __func__ << " pwlc invalid " << shard
446452
<< dendl;
447-
} else if (old_v.version >= entry.version.version) {
453+
} else if (old_v >= entry.version) {
448454
// Abnormal case - consider_adjusting_pwlc may advance pwlc
449455
// during peering because all shards have updates but these
450456
// have not been marked complete. At the end of peering
@@ -455,10 +461,12 @@ void PGBackend::partial_write(
455461
} else {
456462
old_v = previous_version;
457463
new_v = entry.version;
464+
info->partial_writes_last_complete_epoch = get_osdmap_epoch();
458465
}
459466
} else if (new_v == previous_version) {
460467
// Subsequent partial write, contiguous versions
461468
new_v = entry.version;
469+
info->partial_writes_last_complete_epoch = get_osdmap_epoch();
462470
} else {
463471
// Subsequent partial write, discontiguous versions
464472
ldpp_dout(dpp, 20) << __func__ << " cannot update shard " << shard
@@ -471,17 +479,19 @@ void PGBackend::partial_write(
471479
// shard is backfilling or in async recovery, pwlc is invalid
472480
ldpp_dout(dpp, 20) << __func__ << " pwlc invalid " << shard
473481
<< dendl;
474-
} else if (old_v.version >= entry.version.version) {
482+
} else if (old_v >= entry.version) {
475483
// Abnormal case - see above
476484
ldpp_dout(dpp, 20) << __func__ << " pwlc is ahead of entry " << shard
477485
<< dendl;
478486
} else {
479487
old_v = new_v = entry.version;
488+
info->partial_writes_last_complete_epoch = get_osdmap_epoch();
480489
}
481490
}
482491
}
483-
ldpp_dout(dpp, 20) << __func__ << " after pwlc="
484-
<< info->partial_writes_last_complete << dendl;
492+
ldpp_dout(dpp, 20) << __func__ << " after pwlc=e"
493+
<< info->partial_writes_last_complete_epoch
494+
<< ":" << info->partial_writes_last_complete << dendl;
485495
}
486496

487497
void PGBackend::remove(

src/osd/PeeringState.cc

Lines changed: 38 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -331,7 +331,7 @@ void PeeringState::apply_pwlc(const std::pair<eversion_t, eversion_t> pwlc,
331331
// knowledge of partial_writes
332332
const auto & [fromversion, toversion] = pwlc;
333333
if (toversion > info.last_update) {
334-
if (fromversion.version <= info.last_update.version) {
334+
if (fromversion <= info.last_update) {
335335
if (info.last_complete == info.last_update) {
336336
psdout(10) << "osd." << shard << " has last_complete"
337337
<< "=last_update " << info.last_update
@@ -368,8 +368,9 @@ void PeeringState::update_peer_info(const pg_shard_t &from,
368368
{
369369
// Merge pwlc information from another shard into
370370
// info.partial_writes_last_complete keeping the newest
371-
// updates
372-
if (!oinfo.partial_writes_last_complete.empty()) {
371+
// updates. Ignore pwlc from nonprimary shards.
372+
if (!oinfo.partial_writes_last_complete.empty()&&
373+
!pool.info.is_nonprimary_shard(from.shard)) {
373374
bool updated = false;
374375
// oinfo includes partial_writes_last_complete data.
375376
// Merge this with our copy keeping the most up to date versions
@@ -379,12 +380,15 @@ void PeeringState::update_peer_info(const pg_shard_t &from,
379380
if (info.partial_writes_last_complete.contains(shard)) {
380381
auto & [fromversion, toversion] =
381382
info.partial_writes_last_complete[shard];
382-
// Prefer pwlc with a newer toversion, if toversion matches prefer an
383-
// older fromversion.
384-
if ((ofromversion.epoch > fromversion.epoch) ||
385-
((ofromversion.epoch == fromversion.epoch) && (otoversion > toversion)) ||
386-
((ofromversion.epoch == fromversion.epoch) && (otoversion == toversion) &&
387-
(ofromversion.version < fromversion.version))) {
383+
// Prefer pwlc with a newer epoch, then pwlc with a newer
384+
// toversion, then pwlc with an older fromversion.
385+
bool newer_epoch = (oinfo.partial_writes_last_complete_epoch >
386+
info.partial_writes_last_complete_epoch);
387+
bool same_epoch = (oinfo.partial_writes_last_complete_epoch ==
388+
info.partial_writes_last_complete_epoch);
389+
if (newer_epoch ||
390+
(same_epoch && (otoversion > toversion)) ||
391+
(same_epoch && (otoversion == toversion) && (ofromversion < fromversion))) {
388392
if (!updated) {
389393
updated = true;
390394
psdout(10) << "osd." << from
@@ -408,6 +412,10 @@ void PeeringState::update_peer_info(const pg_shard_t &from,
408412
if (updated) {
409413
psdout(10) << "pwlc=" << info.partial_writes_last_complete << dendl;
410414
}
415+
// Update last updated epoch
416+
info.partial_writes_last_complete_epoch = std::max(
417+
info.partial_writes_last_complete_epoch,
418+
oinfo.partial_writes_last_complete_epoch);
411419
}
412420
// 3 cases:
413421
// 1. This is the primary, from is the shard that sent the oinfo which may
@@ -2756,12 +2764,14 @@ bool PeeringState::search_for_missing(
27562764
tinfo.pgid.shard = pg_whoami.shard;
27572765
// add partial write from our info
27582766
tinfo.partial_writes_last_complete = info.partial_writes_last_complete;
2767+
tinfo.partial_writes_last_complete_epoch = info.partial_writes_last_complete_epoch;
27592768
if (info.partial_writes_last_complete.contains(from.shard)) {
27602769
apply_pwlc(info.partial_writes_last_complete[from.shard], from, tinfo);
27612770
}
27622771
if (!tinfo.partial_writes_last_complete.empty()) {
27632772
psdout(20) << "sending info to " << from
2764-
<< " pwlc=" << tinfo.partial_writes_last_complete
2773+
<< " pwlc=e" << tinfo.partial_writes_last_complete_epoch
2774+
<< ":" << tinfo.partial_writes_last_complete
27652775
<< " info=" << tinfo
27662776
<< dendl;
27672777
}
@@ -3020,7 +3030,8 @@ void PeeringState::activate(
30203030
<< " is up to date, queueing in pending_activators" << dendl;
30213031
if (!info.partial_writes_last_complete.empty()) {
30223032
psdout(20) << "sending info to " << peer
3023-
<< " pwlc=" << info.partial_writes_last_complete
3033+
<< " pwlc=e" << info.partial_writes_last_complete_epoch
3034+
<< ":" << info.partial_writes_last_complete
30243035
<< " info=" << info
30253036
<< dendl;
30263037
}
@@ -3057,6 +3068,7 @@ void PeeringState::activate(
30573068
<< " to " << info.last_update;
30583069

30593070
pi.partial_writes_last_complete = info.partial_writes_last_complete;
3071+
pi.partial_writes_last_complete_epoch = info.partial_writes_last_complete_epoch;
30603072
pi.last_update = info.last_update;
30613073
pi.last_complete = info.last_update;
30623074
pi.set_last_backfill(hobject_t());
@@ -3336,12 +3348,10 @@ void PeeringState::consider_rollback_pwlc(eversion_t last_complete)
33363348
psdout(10) << "shard " << shard << " pwlc rolled back to "
33373349
<< info.partial_writes_last_complete[shard] << dendl;
33383350
}
3339-
// Always assign the current epoch to the version number so that
3340-
// pwlc adjustments made by the whole proc_master_log process
3341-
// are recognized as the newest updates
3342-
info.partial_writes_last_complete[shard].first.epoch =
3343-
get_osdmap_epoch();
33443351
}
3352+
// Update the epoch so that pwlc adjustments made by the whole
3353+
// proc_master_log process are recognized as the newest updates
3354+
info.partial_writes_last_complete_epoch = get_osdmap_epoch();
33453355
}
33463356

33473357
void PeeringState::proc_master_log(
@@ -3689,6 +3699,7 @@ void PeeringState::split_into(
36893699

36903700
// fix up pwlc - it may refer to log entries that are no longer in the log
36913701
child->info.partial_writes_last_complete = info.partial_writes_last_complete;
3702+
child->info.partial_writes_last_complete_epoch = info.partial_writes_last_complete_epoch;
36923703
pg_log.split_pwlc(info);
36933704
child->pg_log.split_pwlc(child->info);
36943705

@@ -3857,9 +3868,10 @@ void PeeringState::merge_from(
38573868
info.partial_writes_last_complete) {
38583869
auto &&[old_v, new_v] = versionrange;
38593870
old_v = new_v = info.last_update;
3860-
old_v.epoch = get_osdmap_epoch();
38613871
}
3862-
psdout(10) << "merged pwlc=" << info.partial_writes_last_complete << dendl;
3872+
info.partial_writes_last_complete_epoch = get_osdmap_epoch();
3873+
psdout(10) << "merged pwlc=e" << info.partial_writes_last_complete_epoch
3874+
<< ":" << info.partial_writes_last_complete << dendl;
38633875
}
38643876
}
38653877

@@ -4685,6 +4697,7 @@ void PeeringState::append_log(
46854697
fromversion.version = eversion_t::max().version;
46864698
toversion = fromversion;
46874699
}
4700+
info.partial_writes_last_complete_epoch = 0;
46884701
}
46894702

46904703
for (auto p = logv.begin(); p != logv.end(); ++p) {
@@ -6949,8 +6962,10 @@ boost::statechart::result PeeringState::ReplicaActive::react(
69496962
i.history.last_epoch_started = evt.activation_epoch;
69506963
i.history.last_interval_started = i.history.same_interval_since;
69516964
if (!i.partial_writes_last_complete.empty()) {
6952-
psdout(20) << "sending info to " << ps->get_primary() << " pwlc="
6953-
<< i.partial_writes_last_complete << " info=" << i << dendl;
6965+
psdout(20) << "sending info to " << ps->get_primary() << " pwlc=e"
6966+
<< i.partial_writes_last_complete_epoch
6967+
<< ":" << i.partial_writes_last_complete
6968+
<< " info=" << i << dendl;
69546969
}
69556970
rctx.send_info(
69566971
ps->get_primary().osd,
@@ -7169,11 +7184,12 @@ boost::statechart::result PeeringState::Stray::react(const MInfoRec& infoevt)
71697184
psdout(20) << "info from osd." << infoevt.from
71707185
<< " last_update=" << infoevt.info.last_update
71717186
<< " last_complete=" << infoevt.info.last_complete
7172-
<< " pwlc=" << pwlc
7187+
<< " pwlc=e" << infoevt.info.partial_writes_last_complete_epoch
7188+
<< ":" << pwlc
71737189
<< " our last_update=" << ps->info.last_update << dendl;
71747190
// Our last update must be in the range described by partial write
71757191
// last_complete
7176-
ceph_assert(ps->info.last_update.version >= pwlc.first.version);
7192+
ceph_assert(ps->info.last_update >= pwlc.first);
71777193
// Last complete must match the partial write last_update
71787194
ceph_assert(pwlc.second == infoevt.info.last_update);
71797195
} else {

src/osd/osd_types.cc

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3668,7 +3668,7 @@ list<pg_history_t> pg_history_t::generate_test_instances()
36683668

36693669
void pg_info_t::encode(ceph::buffer::list &bl) const
36703670
{
3671-
ENCODE_START(33, 26, bl);
3671+
ENCODE_START(34, 26, bl);
36723672
encode(pgid.pgid, bl);
36733673
encode(last_update, bl);
36743674
encode(last_complete, bl);
@@ -3685,12 +3685,13 @@ void pg_info_t::encode(ceph::buffer::list &bl) const
36853685
encode(true, bl); // was last_backfill_bitwise
36863686
encode(last_interval_started, bl);
36873687
encode(partial_writes_last_complete, bl);
3688+
encode(partial_writes_last_complete_epoch, bl);
36883689
ENCODE_FINISH(bl);
36893690
}
36903691

36913692
void pg_info_t::decode(ceph::buffer::list::const_iterator &bl)
36923693
{
3693-
DECODE_START(33, bl);
3694+
DECODE_START(34, bl);
36943695
decode(pgid.pgid, bl);
36953696
decode(last_update, bl);
36963697
decode(last_complete, bl);
@@ -3722,6 +3723,9 @@ void pg_info_t::decode(ceph::buffer::list::const_iterator &bl)
37223723
if (struct_v >= 33) {
37233724
decode(partial_writes_last_complete, bl);
37243725
}
3726+
if (struct_v >= 34) {
3727+
decode(partial_writes_last_complete_epoch, bl);
3728+
}
37253729
DECODE_FINISH(bl);
37263730
}
37273731

@@ -3746,6 +3750,7 @@ void pg_info_t::dump(Formatter *f) const
37463750
f->close_section();
37473751
}
37483752
f->close_section();
3753+
f->dump_stream("partial_writes_last_complete_epoch") << partial_writes_last_complete_epoch;
37493754
f->open_array_section("purged_snaps");
37503755
for (interval_set<snapid_t>::const_iterator i=purged_snaps.begin();
37513756
i != purged_snaps.end();

src/osd/osd_types.h

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3076,6 +3076,7 @@ struct pg_info_t {
30763076

30773077
std::map<shard_id_t,std::pair<eversion_t, eversion_t>>
30783078
partial_writes_last_complete; ///< last_complete for shards not modified by a partial write
3079+
epoch_t partial_writes_last_complete_epoch; ///< epoch when pwlc was last updated
30793080

30803081
pg_stat_t stats;
30813082

@@ -3094,6 +3095,7 @@ struct pg_info_t {
30943095
l.last_backfill == r.last_backfill &&
30953096
l.purged_snaps == r.purged_snaps &&
30963097
l.partial_writes_last_complete == r.partial_writes_last_complete &&
3098+
l.partial_writes_last_complete_epoch == r.partial_writes_last_complete_epoch &&
30973099
l.stats == r.stats &&
30983100
l.history == r.history &&
30993101
l.hit_set == r.hit_set;
@@ -3103,15 +3105,17 @@ struct pg_info_t {
31033105
: last_epoch_started(0),
31043106
last_interval_started(0),
31053107
last_user_version(0),
3106-
last_backfill(hobject_t::get_max())
3108+
last_backfill(hobject_t::get_max()),
3109+
partial_writes_last_complete_epoch(0)
31073110
{ }
31083111
// cppcheck-suppress noExplicitConstructor
31093112
pg_info_t(spg_t p)
31103113
: pgid(p),
31113114
last_epoch_started(0),
31123115
last_interval_started(0),
31133116
last_user_version(0),
3114-
last_backfill(hobject_t::get_max())
3117+
last_backfill(hobject_t::get_max()),
3118+
partial_writes_last_complete_epoch(0)
31153119
{ }
31163120

31173121
void set_last_backfill(hobject_t pos) {
@@ -3171,6 +3175,7 @@ struct pg_fast_info_t {
31713175
eversion_t last_complete;
31723176
version_t last_user_version;
31733177
std::map<shard_id_t,std::pair<eversion_t,eversion_t>> partial_writes_last_complete;
3178+
epoch_t partial_writes_last_complete_epoch;
31743179
struct { // pg_stat_t stats
31753180
eversion_t version;
31763181
version_t reported_seq;
@@ -3201,6 +3206,7 @@ struct pg_fast_info_t {
32013206
last_complete = info.last_complete;
32023207
last_user_version = info.last_user_version;
32033208
partial_writes_last_complete = info.partial_writes_last_complete;
3209+
partial_writes_last_complete_epoch = info.partial_writes_last_complete_epoch;
32043210
stats.version = info.stats.version;
32053211
stats.reported_seq = info.stats.reported_seq;
32063212
stats.last_fresh = info.stats.last_fresh;
@@ -3228,6 +3234,7 @@ struct pg_fast_info_t {
32283234
info->last_complete = last_complete;
32293235
info->last_user_version = last_user_version;
32303236
info->partial_writes_last_complete = partial_writes_last_complete;
3237+
info->partial_writes_last_complete_epoch = partial_writes_last_complete_epoch;
32313238
info->stats.version = stats.version;
32323239
info->stats.reported_seq = stats.reported_seq;
32333240
info->stats.last_fresh = stats.last_fresh;
@@ -3251,7 +3258,7 @@ struct pg_fast_info_t {
32513258
}
32523259

32533260
void encode(ceph::buffer::list& bl) const {
3254-
ENCODE_START(2, 1, bl);
3261+
ENCODE_START(3, 1, bl);
32553262
encode(last_update, bl);
32563263
encode(last_complete, bl);
32573264
encode(last_user_version, bl);
@@ -3274,10 +3281,11 @@ struct pg_fast_info_t {
32743281
encode(stats.stats.sum.num_wr_kb, bl);
32753282
encode(stats.stats.sum.num_objects_dirty, bl);
32763283
encode(partial_writes_last_complete, bl);
3284+
encode(partial_writes_last_complete_epoch, bl);
32773285
ENCODE_FINISH(bl);
32783286
}
32793287
void decode(ceph::buffer::list::const_iterator& p) {
3280-
DECODE_START(2, p);
3288+
DECODE_START(3, p);
32813289
decode(last_update, p);
32823290
decode(last_complete, p);
32833291
decode(last_user_version, p);
@@ -3301,6 +3309,8 @@ struct pg_fast_info_t {
33013309
decode(stats.stats.sum.num_objects_dirty, p);
33023310
if (struct_v >= 2)
33033311
decode(partial_writes_last_complete, p);
3312+
if (struct_v >= 3)
3313+
decode(partial_writes_last_complete_epoch, p);
33043314
DECODE_FINISH(p);
33053315
}
33063316
void dump(ceph::Formatter *f) const {
@@ -3317,6 +3327,7 @@ struct pg_fast_info_t {
33173327
f->close_section();
33183328
}
33193329
f->close_section();
3330+
f->dump_stream("partial_writes_last_complete_epoch") << partial_writes_last_complete_epoch;
33203331
f->open_object_section("stats");
33213332
f->dump_stream("version") << stats.version;
33223333
f->dump_unsigned("reported_seq", stats.reported_seq);

0 commit comments

Comments
 (0)