Skip to content

Commit 10f802a

Browse files
authored
Merge pull request ceph#64917 from aainscow/ec_fixpack3_pr
EC fixpack 3
2 parents 5917c06 + b5cad26 commit 10f802a

File tree

15 files changed

+414
-201
lines changed

15 files changed

+414
-201
lines changed

src/crimson/osd/recovery_backend.cc

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -270,8 +270,9 @@ RecoveryBackend::scan_for_backfill_primary(
270270
bool added_default = false;
271271
for (auto & shard: backfill_targets) {
272272
if (shard_versions.contains(shard.shard)) {
273-
version = shard_versions.at(shard.shard);
274-
version_map->emplace(object, std::make_pair(shard.shard, version));
273+
auto shard_version = shard_versions.at(shard.shard);
274+
version_map->emplace(object, std::make_pair(shard.shard,
275+
shard_version));
275276
} else if (!added_default) {
276277
version_map->emplace(object, std::make_pair(shard_id_t::NO_SHARD,
277278
version));

src/osd/ECBackend.cc

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -707,13 +707,10 @@ void ECBackend::handle_sub_read_reply(
707707
rop.debug_log.emplace_back(ECUtil::ERROR, op.from, complete.buffers_read);
708708
complete.buffers_read.erase_shard(from.shard);
709709
complete.processed_read_requests.erase(from.shard);
710-
// If we are doing redundant reads, then we must take care that any failed
711-
// reads are not replaced with a zero buffer. When fast_reads are disabled,
712-
// the send_all_remaining_reads() call will replace the zeros_for_decode
713-
// based on the recovery read.
714-
if (rop.do_redundant_reads) {
715-
rop.to_read.at(hoid).zeros_for_decode.erase(from.shard);
716-
}
710+
// If there was an error for non-zero data on this shard, then we must also
711+
// ignore all zeros, or minimum_to_decode may conclude that it has enough
712+
// shards available.
713+
rop.to_read.at(hoid).zeros_for_decode.erase(from.shard);
717714
dout(20) << __func__ << " shard=" << from << " error=" << err << dendl;
718715
}
719716

src/osd/ECCommon.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -620,7 +620,7 @@ struct ClientReadCompleter final : ECCommon::ReadCompleter {
620620
extent_map result;
621621
if (res.r == 0) {
622622
ceph_assert(res.errors.empty());
623-
dout(20) << __func__ << ": before decode: "
623+
dout(30) << __func__ << ": before decode: "
624624
<< res.buffers_read.debug_string(2048, 0)
625625
<< dendl;
626626
/* Decode any missing buffers */
@@ -630,7 +630,7 @@ struct ClientReadCompleter final : ECCommon::ReadCompleter {
630630
req.object_size,
631631
read_pipeline.get_parent()->get_dpp());
632632
ceph_assert( r == 0 );
633-
dout(20) << __func__ << ": after decode: "
633+
dout(30) << __func__ << ": after decode: "
634634
<< res.buffers_read.debug_string(2048, 0)
635635
<< dendl;
636636

@@ -1286,7 +1286,7 @@ void ECCommon::RecoveryBackend::handle_recovery_read_complete(
12861286
op.returned_data.emplace(std::move(res.buffers_read));
12871287
uint64_t aligned_size = ECUtil::align_next(op.obc->obs.oi.size);
12881288

1289-
dout(20) << __func__ << " before decode: oid=" << op.hoid << " EC_DEBUG_BUFFERS: "
1289+
dout(30) << __func__ << " before decode: oid=" << op.hoid << " EC_DEBUG_BUFFERS: "
12901290
<< op.returned_data->debug_string(2048, 0)
12911291
<< dendl;
12921292

@@ -1299,7 +1299,7 @@ void ECCommon::RecoveryBackend::handle_recovery_read_complete(
12991299
op.returned_data->erase_after_ro_offset(aligned_size);
13001300

13011301
dout(20) << __func__ << ": oid=" << op.hoid << dendl;
1302-
dout(20) << __func__ << " after decode: oid=" << op.hoid << " EC_DEBUG_BUFFERS: "
1302+
dout(30) << __func__ << " after decode: oid=" << op.hoid << " EC_DEBUG_BUFFERS: "
13031303
<< op.returned_data->debug_string(2048, 0)
13041304
<< dendl;
13051305

src/osd/ECTransaction.cc

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ void debug(const hobject_t &oid, const std::string &str,
3838
const ECUtil::shard_extent_map_t &map, DoutPrefixProvider *dpp) {
3939
ldpp_dout(dpp, 20)
4040
<< " generate_transactions: " << "oid: " << oid << " " << str << " " << map << dendl;
41-
ldpp_dout(dpp, 20)
41+
ldpp_dout(dpp, 30)
4242
<< "EC_DEBUG_BUFFERS: " << map.debug_string(2048, 0) << dendl;
4343
}
4444

@@ -213,6 +213,10 @@ ECTransaction::WritePlanObj::WritePlanObj(
213213

214214
if (pdw_write_mode != 0) {
215215
do_parity_delta_write = (pdw_write_mode == 2);
216+
} else if (pdw_read_shards.size() >= sinfo.get_k()) {
217+
// Even if recovery required for a convention RMW, PDW is not more
218+
// efficient.
219+
do_parity_delta_write = false;
216220
} else if (!shard_id_set::difference(pdw_read_shards, readable_shards).empty()) {
217221
// Some kind of reconstruct would be needed for PDW, so don't bother.
218222
do_parity_delta_write = false;
@@ -588,7 +592,7 @@ ECTransaction::Generate::Generate(PGTransaction &t,
588592
shard_written(shard_id_t(0));
589593
}
590594

591-
written_and_present_shards();
595+
written_shards();
592596

593597
if (!op.attr_updates.empty()) {
594598
attr_updates();
@@ -855,7 +859,7 @@ void ECTransaction::Generate::appends_and_clone_ranges() {
855859
}
856860
}
857861

858-
void ECTransaction::Generate::written_and_present_shards() {
862+
void ECTransaction::Generate::written_shards() {
859863
if (entry) {
860864
if (!rollback_extents.empty()) {
861865
entry->mod_desc.rollback_extents(
@@ -869,14 +873,6 @@ void ECTransaction::Generate::written_and_present_shards() {
869873
entry->written_shards.clear();
870874
written_shards_final = true;
871875
}
872-
// Calculate set of present shards
873-
for (auto &&[shard, t]: transactions) {
874-
entry->present_shards.insert(shard);
875-
}
876-
if (entry->present_shards.size() == sinfo.get_k_plus_m()) {
877-
// More efficient to encode an empty set for all shards
878-
entry->present_shards.clear();
879-
}
880876

881877
// Update shard_versions in object_info to record which shards are being
882878
// written
@@ -932,7 +928,6 @@ void ECTransaction::Generate::written_and_present_shards() {
932928
}
933929
ldpp_dout(dpp, 20) << __func__ << " shard_info: oid=" << oid
934930
<< " version=" << entry->version
935-
<< " present=" << entry->present_shards
936931
<< " written=" << entry->written_shards
937932
<< " shard_versions=" << oi.shard_versions << dendl;
938933
}

src/osd/ECTransaction.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ class Generate {
106106
void truncate();
107107
void overlay_writes();
108108
void appends_and_clone_ranges();
109-
void written_and_present_shards();
109+
void written_shards();
110110
void attr_updates();
111111

112112
public:

src/osd/ECUtil.cc

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -674,7 +674,11 @@ int shard_extent_map_t::decode(const ErasureCodeInterfaceRef &ec_impl,
674674
*/
675675
decode_set.insert(decode_for_parity_shards);
676676
need_set.insert(decode_for_parity_shards);
677-
extent_set decode_for_parity = get_extent_superset();
677+
extent_set decode_for_parity;
678+
679+
for (auto shard : encode_set) {
680+
decode_for_parity.insert(want.at(shard));
681+
}
678682

679683
for (auto shard : decode_for_parity_shards) {
680684
extent_set parity_pad;
@@ -983,25 +987,47 @@ bufferlist shard_extent_map_t::get_ro_buffer() const {
983987
return get_ro_buffer(ro_start, ro_end - ro_start);
984988
}
985989

990+
bool get_int_from_bufferlist(bufferlist bl, int offset, uint32_t *value) {
991+
*value = 0;
992+
993+
if (bl.length() < offset + sizeof(uint32_t)) {
994+
return false;
995+
}
996+
997+
auto bl_iter = bl.begin();
998+
bl_iter += offset;
999+
unsigned b = 0;
1000+
for (; b < sizeof(uint32_t); ++b, ++bl_iter) {
1001+
ceph_assert(bl_iter != bl.end());
1002+
*value |= (((unsigned int)bl_iter.get_current_ptr().c_str()[0]) << b);
1003+
}
1004+
return true;
1005+
}
1006+
9861007
std::string shard_extent_map_t::debug_string(uint64_t interval, uint64_t offset) const {
9871008
std::stringstream str;
9881009
str << "shard_extent_map_t: " << *this << " bufs: [";
9891010

9901011
bool s_comma = false;
9911012
for (auto &&[shard, emap] : get_extent_maps()) {
992-
if (s_comma) str << ", ";
1013+
if (s_comma) {
1014+
str << ", ";
1015+
}
9931016
s_comma = true;
9941017
str << shard << ": [";
9951018

9961019
bool comma = false;
9971020
for (auto &&extent : emap) {
9981021
bufferlist bl = extent.get_val();
999-
char *buf = bl.c_str();
1000-
for (uint64_t i = 0; i < extent.get_len(); i += interval) {
1001-
int *seed = (int*)&buf[i + offset];
1002-
if (comma) str << ", ";
1003-
str << (i + extent.get_off()) << ":" << std::to_string(*seed);
1022+
uint32_t seed;
1023+
int i = 0;
1024+
while (get_int_from_bufferlist(bl, i + offset, &seed)) {
1025+
if (comma) {
1026+
str << ", ";
1027+
}
1028+
str << (i + extent.get_off()) << ":" << seed;
10041029
comma = true;
1030+
i += interval;
10051031
}
10061032
}
10071033
str << "]";

src/osd/PGBackend.cc

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -420,22 +420,27 @@ void PGBackend::partial_write(
420420
if (entry.written_shards.empty() && info->partial_writes_last_complete.empty()) {
421421
return;
422422
}
423+
const pg_pool_t &pool = get_parent()->get_pool();
424+
if (pool.is_nonprimary_shard(get_parent()->whoami_shard().shard)) {
425+
// Don't update pwlc on nonprimary shards because they only
426+
// observe writes that update their shard
427+
return;
428+
}
423429
auto dpp = get_parent()->get_dpp();
424430
ldpp_dout(dpp, 20) << __func__ << " version=" << entry.version
425431
<< " written_shards=" << entry.written_shards
426-
<< " present_shards=" << entry.present_shards
427-
<< " pwlc=" << info->partial_writes_last_complete
432+
<< " pwlc=e" << info->partial_writes_last_complete_epoch
433+
<< ":" << info->partial_writes_last_complete
428434
<< " previous_version=" << previous_version
429435
<< dendl;
430-
const pg_pool_t &pool = get_parent()->get_pool();
431436
for (shard_id_t shard : pool.nonprimary_shards) {
432437
auto pwlc_iter = info->partial_writes_last_complete.find(shard);
433438
if (!entry.is_written_shard(shard)) {
434439
if (pwlc_iter == info->partial_writes_last_complete.end()) {
435440
// 1st partial write since all logs were updated
436441
info->partial_writes_last_complete[shard] =
437442
std::pair(previous_version, entry.version);
438-
443+
info->partial_writes_last_complete_epoch = get_osdmap_epoch();
439444
continue;
440445
}
441446
auto &&[old_v, new_v] = pwlc_iter->second;
@@ -445,7 +450,7 @@ void PGBackend::partial_write(
445450
// invalid
446451
ldpp_dout(dpp, 20) << __func__ << " pwlc invalid " << shard
447452
<< dendl;
448-
} else if (old_v.version >= entry.version.version) {
453+
} else if (old_v >= entry.version) {
449454
// Abnormal case - consider_adjusting_pwlc may advance pwlc
450455
// during peering because all shards have updates but these
451456
// have not been marked complete. At the end of peering
@@ -456,10 +461,12 @@ void PGBackend::partial_write(
456461
} else {
457462
old_v = previous_version;
458463
new_v = entry.version;
464+
info->partial_writes_last_complete_epoch = get_osdmap_epoch();
459465
}
460466
} else if (new_v == previous_version) {
461467
// Subsequent partial write, contiguous versions
462468
new_v = entry.version;
469+
info->partial_writes_last_complete_epoch = get_osdmap_epoch();
463470
} else {
464471
// Subsequent partial write, discontiguous versions
465472
ldpp_dout(dpp, 20) << __func__ << " cannot update shard " << shard
@@ -472,17 +479,19 @@ void PGBackend::partial_write(
472479
// shard is backfilling or in async recovery, pwlc is invalid
473480
ldpp_dout(dpp, 20) << __func__ << " pwlc invalid " << shard
474481
<< dendl;
475-
} else if (old_v.version >= entry.version.version) {
482+
} else if (old_v >= entry.version) {
476483
// Abnormal case - see above
477484
ldpp_dout(dpp, 20) << __func__ << " pwlc is ahead of entry " << shard
478485
<< dendl;
479486
} else {
480487
old_v = new_v = entry.version;
488+
info->partial_writes_last_complete_epoch = get_osdmap_epoch();
481489
}
482490
}
483491
}
484-
ldpp_dout(dpp, 20) << __func__ << " after pwlc="
485-
<< info->partial_writes_last_complete << dendl;
492+
ldpp_dout(dpp, 20) << __func__ << " after pwlc=e"
493+
<< info->partial_writes_last_complete_epoch
494+
<< ":" << info->partial_writes_last_complete << dendl;
486495
}
487496

488497
void PGBackend::remove(

0 commit comments

Comments
 (0)