Skip to content

Commit ee1b273

Browse files
authored
Merge pull request ceph#62451 from bill-scales/ec_data_structs
osd: EC optimizations: new types and additions to data structures

Reviewed-by: Samuel Just <[email protected]>
Reviewed-by: Radoslaw Zarzynski <[email protected]>
2 parents 7b976e6 + 88ac6d9 commit ee1b273

File tree

3 files changed

+97
-16
lines changed

3 files changed

+97
-16
lines changed

src/osd/ECTypes.h

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,16 +21,14 @@ struct ec_align_t {
2121
uint64_t offset;
2222
uint64_t size;
2323
uint32_t flags;
24-
friend std::ostream &operator<<(std::ostream &lhs, const ec_align_t &rhs) {
25-
return lhs << rhs.offset << ","
26-
<< rhs.size << ","
27-
<< rhs.flags;
28-
}
2924
ec_align_t(std::pair<uint64_t, uint64_t> p, uint32_t flags)
3025
: offset(p.first), size(p.second), flags(flags) {}
3126
ec_align_t(uint64_t offset, uint64_t size, uint32_t flags)
3227
: offset(offset), size(size), flags(flags) {}
3328
bool operator==(const ec_align_t &other) const;
29+
void print(std::ostream &os) const { // Writes this ec_align_t to os in the form "offset,size,flags".
30+
os << offset << "," << size << "," << flags; // comma-separated, no trailing newline
31+
}
3432
};
3533

3634
struct raw_shard_id_t {

src/osd/osd_types.cc

Lines changed: 52 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1642,6 +1642,7 @@ void pg_pool_t::dump(Formatter *f) const
16421642
f->dump_unsigned("stripe_width", get_stripe_width());
16431643
f->dump_unsigned("expected_num_objects", expected_num_objects);
16441644
f->dump_bool("fast_read", fast_read);
1645+
f->dump_stream("nonprimary_shards") << nonprimary_shards;
16451646
f->open_object_section("options");
16461647
opts.dump(f);
16471648
f->close_section(); // options
@@ -1961,7 +1962,7 @@ void pg_pool_t::encode(ceph::buffer::list& bl, uint64_t features) const
19611962
return;
19621963
}
19631964

1964-
uint8_t v = 31;
1965+
uint8_t v = 32;
19651966
// NOTE: any new encoding dependencies must be reflected by
19661967
// SIGNIFICANT_FEATURES
19671968
if (!HAVE_FEATURE(features, SERVER_TENTACLE)) {
@@ -2080,12 +2081,15 @@ void pg_pool_t::encode(ceph::buffer::list& bl, uint64_t features) const
20802081
auto maybe_peering_crush_data1 = maybe_peering_crush_data();
20812082
encode(maybe_peering_crush_data1, bl);
20822083
}
2084+
if (v >= 32) {
2085+
encode(nonprimary_shards, bl);
2086+
}
20832087
ENCODE_FINISH(bl);
20842088
}
20852089

20862090
void pg_pool_t::decode(ceph::buffer::list::const_iterator& bl)
20872091
{
2088-
DECODE_START_LEGACY_COMPAT_LEN(31, 5, 5, bl);
2092+
DECODE_START_LEGACY_COMPAT_LEN(32, 5, 5, bl);
20892093
decode(type, bl);
20902094
decode(size, bl);
20912095
decode(crush_rule, bl);
@@ -2276,6 +2280,11 @@ void pg_pool_t::decode(ceph::buffer::list::const_iterator& bl)
22762280
peering_crush_mandatory_member) = *peering_crush_data;
22772281
}
22782282
}
2283+
if (struct_v >= 32) {
2284+
decode(nonprimary_shards, bl);
2285+
} else {
2286+
nonprimary_shards.clear();
2287+
}
22792288
DECODE_FINISH(bl);
22802289
calc_pg_masks();
22812290
calc_grade_table();
@@ -2377,6 +2386,7 @@ void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
23772386
a.erasure_code_profile = "profile in osdmap";
23782387
a.expected_num_objects = 123456;
23792388
a.fast_read = false;
2389+
a.nonprimary_shards.clear();
23802390
a.application_metadata = {{"rbd", {{"key", "value"}}}};
23812391
o.push_back(new pg_pool_t(a));
23822392

@@ -3617,7 +3627,7 @@ void pg_history_t::generate_test_instances(list<pg_history_t*>& o)
36173627

36183628
void pg_info_t::encode(ceph::buffer::list &bl) const
36193629
{
3620-
ENCODE_START(32, 26, bl);
3630+
ENCODE_START(33, 26, bl);
36213631
encode(pgid.pgid, bl);
36223632
encode(last_update, bl);
36233633
encode(last_complete, bl);
@@ -3633,12 +3643,13 @@ void pg_info_t::encode(ceph::buffer::list &bl) const
36333643
encode(last_backfill, bl);
36343644
encode(true, bl); // was last_backfill_bitwise
36353645
encode(last_interval_started, bl);
3646+
encode(partial_writes_last_complete, bl);
36363647
ENCODE_FINISH(bl);
36373648
}
36383649

36393650
void pg_info_t::decode(ceph::buffer::list::const_iterator &bl)
36403651
{
3641-
DECODE_START(32, bl);
3652+
DECODE_START(33, bl);
36423653
decode(pgid.pgid, bl);
36433654
decode(last_update, bl);
36443655
decode(last_complete, bl);
@@ -3667,6 +3678,9 @@ void pg_info_t::decode(ceph::buffer::list::const_iterator &bl)
36673678
} else {
36683679
last_interval_started = last_epoch_started;
36693680
}
3681+
if (struct_v >= 33) {
3682+
decode(partial_writes_last_complete, bl);
3683+
}
36703684
DECODE_FINISH(bl);
36713685
}
36723686

@@ -3681,6 +3695,16 @@ void pg_info_t::dump(Formatter *f) const
36813695
f->dump_stream("log_tail") << log_tail;
36823696
f->dump_int("last_user_version", last_user_version);
36833697
f->dump_stream("last_backfill") << last_backfill;
3698+
f->open_array_section("partial_writes_last_complete");
3699+
for (const auto & [shard, versionrange] : partial_writes_last_complete) {
3700+
auto & [from, to] = versionrange;
3701+
f->open_object_section("shard");
3702+
f->dump_int("id", int(shard));
3703+
f->dump_stream("from") << from;
3704+
f->dump_stream("to") << to;
3705+
f->close_section();
3706+
}
3707+
f->close_section();
36843708
f->open_array_section("purged_snaps");
36853709
for (interval_set<snapid_t>::const_iterator i=purged_snaps.begin();
36863710
i != purged_snaps.end();
@@ -4930,7 +4954,7 @@ void pg_log_entry_t::decode_with_checksum(ceph::buffer::list::const_iterator& p)
49304954

49314955
void pg_log_entry_t::encode(ceph::buffer::list &bl) const
49324956
{
4933-
ENCODE_START(14, 4, bl);
4957+
ENCODE_START(15, 4, bl);
49344958
encode(op, bl);
49354959
encode(soid, bl);
49364960
encode(version, bl);
@@ -4963,12 +4987,14 @@ void pg_log_entry_t::encode(ceph::buffer::list &bl) const
49634987
if (op != ERROR)
49644988
encode(return_code, bl);
49654989
encode(op_returns, bl);
4990+
encode(written_shards, bl);
4991+
encode(present_shards, bl);
49664992
ENCODE_FINISH(bl);
49674993
}
49684994

49694995
void pg_log_entry_t::decode(ceph::buffer::list::const_iterator &bl)
49704996
{
4971-
DECODE_START_LEGACY_COMPAT_LEN(14, 4, 4, bl);
4997+
DECODE_START_LEGACY_COMPAT_LEN(15, 4, 4, bl);
49724998
decode(op, bl);
49734999
if (struct_v < 2) {
49745000
sobject_t old_soid;
@@ -5034,6 +5060,10 @@ void pg_log_entry_t::decode(ceph::buffer::list::const_iterator &bl)
50345060
}
50355061
decode(op_returns, bl);
50365062
}
5063+
if (struct_v >= 15) {
5064+
decode(written_shards, bl);
5065+
decode(present_shards, bl);
5066+
}
50375067
DECODE_FINISH(bl);
50385068
}
50395069

@@ -5083,6 +5113,8 @@ void pg_log_entry_t::dump(Formatter *f) const
50835113
f->dump_unsigned("snap", *p);
50845114
f->close_section();
50855115
}
5116+
f->dump_stream("written_shards") << written_shards;
5117+
f->dump_stream("present_shards") << present_shards;
50865118
{
50875119
f->open_object_section("mod_desc");
50885120
mod_desc.dump(f);
@@ -6407,7 +6439,7 @@ void object_info_t::encode(ceph::buffer::list& bl, uint64_t features) const
64076439
for (auto i = watchers.cbegin(); i != watchers.cend(); ++i) {
64086440
old_watchers.insert(make_pair(i->first.second, i->second));
64096441
}
6410-
ENCODE_START(17, 8, bl);
6442+
ENCODE_START(18, 8, bl);
64116443
encode(soid, bl);
64126444
encode(myoloc, bl); //Retained for compatibility
64136445
encode((__u32)0, bl); // was category, no longer used
@@ -6441,13 +6473,14 @@ void object_info_t::encode(ceph::buffer::list& bl, uint64_t features) const
64416473
if (has_manifest()) {
64426474
encode(manifest, bl);
64436475
}
6476+
encode(shard_versions, bl);
64446477
ENCODE_FINISH(bl);
64456478
}
64466479

64476480
void object_info_t::decode(ceph::buffer::list::const_iterator& bl)
64486481
{
64496482
object_locator_t myoloc;
6450-
DECODE_START_LEGACY_COMPAT_LEN(17, 8, 8, bl);
6483+
DECODE_START_LEGACY_COMPAT_LEN(18, 8, 8, bl);
64516484
map<entity_name_t, watch_info_t> old_watchers;
64526485
decode(soid, bl);
64536486
decode(myoloc, bl);
@@ -6533,6 +6566,9 @@ void object_info_t::decode(ceph::buffer::list::const_iterator& bl)
65336566
decode(manifest, bl);
65346567
}
65356568
}
6569+
if (struct_v >= 18) {
6570+
decode(shard_versions, bl);
6571+
}
65366572
DECODE_FINISH(bl);
65376573
}
65386574

@@ -6572,6 +6608,14 @@ void object_info_t::dump(Formatter *f) const
65726608
f->close_section();
65736609
}
65746610
f->close_section();
6611+
f->open_array_section("shard_versions");
6612+
for (auto p = shard_versions.cbegin(); p != shard_versions.cend(); ++p) {
6613+
f->open_object_section("shard");
6614+
f->dump_int("id", int(p->first));
6615+
f->dump_stream("version") << p->second;
6616+
f->close_section();
6617+
}
6618+
f->close_section();
65756619
}
65766620

65776621
void object_info_t::generate_test_instances(list<object_info_t*>& o)

src/osd/osd_types.h

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1626,7 +1626,7 @@ struct pg_pool_t {
16261626
uint64_t expected_num_objects = 0; ///< expected number of objects on this pool, a value of 0 indicates
16271627
///< user does not specify any expected value
16281628
bool fast_read = false; ///< whether turn on fast read on the pool or not
1629-
1629+
shard_id_set nonprimary_shards; ///< EC partial writes: shards that cannot become a primary
16301630
pool_opts_t opts; ///< options
16311631

16321632
typedef enum {
@@ -1931,6 +1931,11 @@ struct pg_pool_t {
19311931
/// choose a random hash position within a pg
19321932
uint32_t get_random_pg_position(pg_t pgid, uint32_t seed) const;
19331933

1934+
/// EC partial writes: test if a shard is a non-primary, i.e. a member of nonprimary_shards
1935+
bool is_nonprimary_shard(const shard_id_t shard) const {
1936+
return !nonprimary_shards.empty() && nonprimary_shards.contains(shard); // an empty set means no shard is non-primary
1937+
}
1938+
19341939
void encode(ceph::buffer::list& bl, uint64_t features) const;
19351940
void decode(ceph::buffer::list::const_iterator& bl);
19361941

@@ -3052,6 +3057,9 @@ struct pg_info_t {
30523057

30533058
interval_set<snapid_t> purged_snaps;
30543059

3060+
std::map<shard_id_t,std::pair<eversion_t, eversion_t>>
3061+
partial_writes_last_complete; ///< last_complete for shards not modified by a partial write
3062+
30553063
pg_stat_t stats;
30563064

30573065
pg_history_t history;
@@ -3068,6 +3076,7 @@ struct pg_info_t {
30683076
l.log_tail == r.log_tail &&
30693077
l.last_backfill == r.last_backfill &&
30703078
l.purged_snaps == r.purged_snaps &&
3079+
l.partial_writes_last_complete == r.partial_writes_last_complete &&
30713080
l.stats == r.stats &&
30723081
l.history == r.history &&
30733082
l.hit_set == r.hit_set;
@@ -3144,6 +3153,7 @@ struct pg_fast_info_t {
31443153
eversion_t last_update;
31453154
eversion_t last_complete;
31463155
version_t last_user_version;
3156+
std::map<shard_id_t,std::pair<eversion_t,eversion_t>> partial_writes_last_complete;
31473157
struct { // pg_stat_t stats
31483158
eversion_t version;
31493159
version_t reported_seq;
@@ -3173,6 +3183,7 @@ struct pg_fast_info_t {
31733183
last_update = info.last_update;
31743184
last_complete = info.last_complete;
31753185
last_user_version = info.last_user_version;
3186+
partial_writes_last_complete = info.partial_writes_last_complete;
31763187
stats.version = info.stats.version;
31773188
stats.reported_seq = info.stats.reported_seq;
31783189
stats.last_fresh = info.stats.last_fresh;
@@ -3199,6 +3210,7 @@ struct pg_fast_info_t {
31993210
info->last_update = last_update;
32003211
info->last_complete = last_complete;
32013212
info->last_user_version = last_user_version;
3213+
info->partial_writes_last_complete = partial_writes_last_complete;
32023214
info->stats.version = stats.version;
32033215
info->stats.reported_seq = stats.reported_seq;
32043216
info->stats.last_fresh = stats.last_fresh;
@@ -3222,7 +3234,7 @@ struct pg_fast_info_t {
32223234
}
32233235

32243236
void encode(ceph::buffer::list& bl) const {
3225-
ENCODE_START(1, 1, bl);
3237+
ENCODE_START(2, 1, bl);
32263238
encode(last_update, bl);
32273239
encode(last_complete, bl);
32283240
encode(last_user_version, bl);
@@ -3244,10 +3256,11 @@ struct pg_fast_info_t {
32443256
encode(stats.stats.sum.num_wr, bl);
32453257
encode(stats.stats.sum.num_wr_kb, bl);
32463258
encode(stats.stats.sum.num_objects_dirty, bl);
3259+
encode(partial_writes_last_complete, bl);
32473260
ENCODE_FINISH(bl);
32483261
}
32493262
void decode(ceph::buffer::list::const_iterator& p) {
3250-
DECODE_START(1, p);
3263+
DECODE_START(2, p);
32513264
decode(last_update, p);
32523265
decode(last_complete, p);
32533266
decode(last_user_version, p);
@@ -3269,12 +3282,24 @@ struct pg_fast_info_t {
32693282
decode(stats.stats.sum.num_wr, p);
32703283
decode(stats.stats.sum.num_wr_kb, p);
32713284
decode(stats.stats.sum.num_objects_dirty, p);
3285+
if (struct_v >= 2)
3286+
decode(partial_writes_last_complete, p);
32723287
DECODE_FINISH(p);
32733288
}
32743289
void dump(ceph::Formatter *f) const {
32753290
f->dump_stream("last_update") << last_update;
32763291
f->dump_stream("last_complete") << last_complete;
32773292
f->dump_stream("last_user_version") << last_user_version;
3293+
f->open_array_section("partial_writes_last_complete");
3294+
for (const auto & [shard, versionrange] : partial_writes_last_complete) {
3295+
auto & [from, to] = versionrange;
3296+
f->open_object_section("shard");
3297+
f->dump_int("id", int(shard));
3298+
f->dump_stream("from") << from;
3299+
f->dump_stream("to") << to;
3300+
f->close_section();
3301+
}
3302+
f->close_section();
32783303
f->open_object_section("stats");
32793304
f->dump_stream("version") << stats.version;
32803305
f->dump_unsigned("reported_seq", stats.reported_seq);
@@ -4438,6 +4463,9 @@ struct pg_log_entry_t {
44384463
bool invalid_pool; // only when decoding pool-less hobject based entries
44394464
ObjectCleanRegions clean_regions;
44404465

4466+
shard_id_set written_shards; // EC partial writes do not update every shard
4467+
shard_id_set present_shards; // EC partial writes need to know set of present shards
4468+
44414469
pg_log_entry_t()
44424470
: user_version(0), return_code(0), op(0),
44434471
invalid_hash(false), invalid_pool(false) {
@@ -4506,6 +4534,15 @@ struct pg_log_entry_t {
45064534
}
45074535

45084536
std::string get_key_name() const;
4537+
4538+
/// EC partial writes: test if a shard was written (true for every shard when written_shards is empty)
4539+
bool is_written_shard(const shard_id_t shard) const {
4540+
return written_shards.empty() || written_shards.contains(shard); // empty set: no shard is excluded
4541+
}
4542+
bool is_present_shard(const shard_id_t shard) const { // EC partial writes: test if a shard is in present_shards
4543+
return present_shards.empty() || present_shards.contains(shard); // true for every shard when present_shards is empty
4544+
}
4545+
45094546
void encode_with_checksum(ceph::buffer::list& bl) const;
45104547
void decode_with_checksum(ceph::buffer::list::const_iterator& p);
45114548

@@ -6057,6 +6094,8 @@ struct object_info_t {
60576094

60586095
struct object_manifest_t manifest;
60596096

6097+
std::map<shard_id_t,eversion_t> shard_versions;
6098+
60606099
void copy_user_bits(const object_info_t& other);
60616100

60626101
bool test_flag(flag_t f) const {

0 commit comments

Comments
 (0)