Skip to content

Commit f910e9d

Browse files
authored
Merge pull request ceph#58250 from cyx1231st/wip-seastore-optimize-ool-padding
crimson/os/seastore: write ool extents without padding
Reviewed-by: Samuel Just <[email protected]>
2 parents 9cf1828 + 35085b9 commit f910e9d

File tree

5 files changed

+109
-18
lines changed

5 files changed

+109
-18
lines changed

src/crimson/os/seastore/cache.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1136,7 +1136,7 @@ record_t Cache::prepare_record(
11361136
t.read_set.clear();
11371137
t.write_set.clear();
11381138

1139-
record_t record(trans_src);
1139+
record_t record(record_type_t::JOURNAL, trans_src);
11401140
auto commit_time = seastar::lowres_system_clock::now();
11411141

11421142
// Add new copy of mutated blocks, set_io_wait to block until written

src/crimson/os/seastore/extent_placement_manager.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ SegmentedOolWriter::do_write(
8484
return do_write(t, extents);
8585
});
8686
}
87-
record_t record(t.get_src());
87+
record_t record(record_type_t::OOL, t.get_src());
8888
std::list<LogicalCachedExtentRef> pending_extents;
8989
auto commit_time = seastar::lowres_system_clock::now();
9090

src/crimson/os/seastore/journal/record_submitter.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -493,7 +493,7 @@ void RecordSubmitter::account_submission(
493493
stats.record_batch_stats.increment(rg.get_size());
494494

495495
for (const record_t& r : rg.records) {
496-
auto src = r.type;
496+
auto src = r.trans_type;
497497
assert(is_modify_transaction(src));
498498
auto& trans_stats = get_by_src(stats.stats_by_src, src);
499499
++(trans_stats.num_records);

src/crimson/os/seastore/seastore_types.cc

Lines changed: 65 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -388,20 +388,32 @@ std::ostream &operator<<(std::ostream &out, const segment_tail_t &tail)
388388

389389
/// Raw (unpadded) metadata length accounted for a single record.
///
/// For JOURNAL records this is plain_mdlength plus the bounded encoded size
/// of record_header_t. For any other (OOL) record no metadata is expected:
/// plain_mdlength is asserted to be zero and returned as-is.
extent_len_t record_size_t::get_raw_mdlength() const
{
  assert(record_type < record_type_t::MAX);
  if (record_type != record_type_t::JOURNAL) {
    // OOL won't contain metadata
    assert(plain_mdlength == 0);
    return plain_mdlength;
  }
  // JOURNAL: an empty record is allowed to be submitted, so the record
  // header is always added, even when plain_mdlength is still 0.
  return plain_mdlength +
         ceph::encoded_sizeof_bounded<record_header_t>();
}
395402

396403
/// Account one extent of extent_len bytes (must be non-zero) to this record.
///
/// The extent's data length is always added to dlength. Only JOURNAL records
/// additionally grow plain_mdlength by the bounded encoded size of
/// extent_info_t.
void record_size_t::account_extent(extent_len_t extent_len)
{
  assert(extent_len);
  dlength += extent_len;
  // OOL won't contain metadata, so only JOURNAL records account for it.
  const bool has_metadata = (record_type == record_type_t::JOURNAL);
  if (has_metadata) {
    plain_mdlength += ceph::encoded_sizeof_bounded<extent_info_t>();
  }
}
402413

403414
void record_size_t::account(const delta_info_t& delta)
404415
{
416+
assert(record_type == record_type_t::JOURNAL);
405417
assert(delta.bl.length());
406418
plain_mdlength += ceph::encoded_sizeof(delta);
407419
}
@@ -433,15 +445,32 @@ std::ostream &operator<<(std::ostream &os, transaction_type_t type)
433445
std::ostream &operator<<(std::ostream& out, const record_size_t& rsize)
434446
{
435447
return out << "record_size_t("
448+
<< "record_type=" << rsize.record_type
436449
<< "raw_md=" << rsize.get_raw_mdlength()
437450
<< ", data=" << rsize.dlength
438451
<< ")";
439452
}
440453

454+
std::ostream &operator<<(std::ostream& out, const record_type_t& type)
455+
{
456+
switch (type) {
457+
case record_type_t::JOURNAL:
458+
return out << "JOURNAL";
459+
case record_type_t::OOL:
460+
return out << "OOL";
461+
case record_type_t::MAX:
462+
return out << "NULL";
463+
default:
464+
return out << "INVALID_RECORD_TYPE("
465+
<< static_cast<std::size_t>(type)
466+
<< ")";
467+
}
468+
}
469+
441470
std::ostream &operator<<(std::ostream& out, const record_t& r)
442471
{
443472
return out << "record_t("
444-
<< "type=" << r.type
473+
<< "trans_type=" << r.trans_type
445474
<< ", num_extents=" << r.extents.size()
446475
<< ", num_deltas=" << r.deltas.size()
447476
<< ", modify_time=" << sea_time_point_printer_t{r.modify_time}
@@ -472,9 +501,16 @@ std::ostream& operator<<(std::ostream& out, const record_group_header_t& h)
472501

473502
/// Raw (unpadded) metadata length accounted for a record group.
///
/// For JOURNAL groups this is plain_mdlength plus sizeof(checksum_t) plus
/// the bounded encoded size of record_group_header_t. For any other (OOL)
/// group no metadata is expected: plain_mdlength is asserted to be zero and
/// returned as-is.
extent_len_t record_group_size_t::get_raw_mdlength() const
{
  assert(record_type < record_type_t::MAX);
  if (record_type != record_type_t::JOURNAL) {
    // OOL won't contain metadata
    assert(plain_mdlength == 0);
    return plain_mdlength;
  }
  // JOURNAL
  extent_len_t raw_mdlength = plain_mdlength;
  raw_mdlength += sizeof(checksum_t);
  raw_mdlength += ceph::encoded_sizeof_bounded<record_group_header_t>();
  return raw_mdlength;
}
479515

480516
void record_group_size_t::account(
@@ -485,14 +521,23 @@ void record_group_size_t::account(
485521
assert(_block_size > 0);
486522
assert(rsize.dlength % _block_size == 0);
487523
assert(block_size == 0 || block_size == _block_size);
488-
plain_mdlength += rsize.get_raw_mdlength();
489-
dlength += rsize.dlength;
524+
assert(record_type == RECORD_TYPE_NULL ||
525+
record_type == rsize.record_type);
490526
block_size = _block_size;
527+
record_type = rsize.record_type;
528+
if (record_type == record_type_t::JOURNAL) {
529+
plain_mdlength += rsize.get_raw_mdlength();
530+
} else {
531+
// OOL won't contain metadata
532+
assert(rsize.get_raw_mdlength() == 0);
533+
}
534+
dlength += rsize.dlength;
491535
}
492536

493537
std::ostream& operator<<(std::ostream& out, const record_group_size_t& size)
494538
{
495539
return out << "record_group_size_t("
540+
<< "record_type=" << size.record_type
496541
<< "raw_md=" << size.get_raw_mdlength()
497542
<< ", data=" << size.dlength
498543
<< ", block_size=" << size.block_size
@@ -526,6 +571,7 @@ ceph::bufferlist encode_records(
526571
const journal_seq_t& committed_to,
527572
segment_nonce_t current_segment_nonce)
528573
{
574+
assert(record_group.size.record_type < record_type_t::MAX);
529575
assert(record_group.size.block_size > 0);
530576
assert(record_group.records.size() > 0);
531577

@@ -537,6 +583,15 @@ ceph::bufferlist encode_records(
537583
}
538584
}
539585

586+
if (record_group.size.record_type == record_type_t::OOL) {
587+
// OOL won't contain metadata
588+
assert(record_group.size.get_mdlength() == 0);
589+
ceph_assert(data_bl.length() ==
590+
record_group.size.get_encoded_length());
591+
record_group.clear();
592+
return data_bl;
593+
}
594+
// JOURNAL
540595
bufferlist bl;
541596
record_group_header_t header{
542597
static_cast<extent_len_t>(record_group.records.size()),
@@ -552,7 +607,7 @@ ceph::bufferlist encode_records(
552607

553608
for (auto& r: record_group.records) {
554609
record_header_t rheader{
555-
r.type,
610+
r.trans_type,
556611
(extent_len_t)r.deltas.size(),
557612
(extent_len_t)r.extents.size(),
558613
timepoint_to_mod(r.modify_time)

src/crimson/os/seastore/seastore_types.h

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1893,7 +1893,20 @@ constexpr bool is_modify_transaction(transaction_type_t type) {
18931893
is_background_transaction(type));
18941894
}
18951895

1896+
// Kind of record being sized/encoded.
//
// Note: Introducing dedicated static structs for OOL would be more
// efficient, but that would also require specializing the RecordSubmitter.
// Defer this optimization until it becomes necessary.
enum class record_type_t {
  JOURNAL = 0,
  OOL = 1,  // no header, no metadata, so no padding
  MAX = 2   // upper bound; doubles as RECORD_TYPE_NULL
};
std::ostream &operator<<(std::ostream&, const record_type_t&);

static constexpr auto RECORD_TYPE_NULL = record_type_t::MAX;
1907+
18961908
struct record_size_t {
1909+
record_type_t record_type = RECORD_TYPE_NULL; // must not be NULL in use
18971910
extent_len_t plain_mdlength = 0; // mdlength without the record header
18981911
extent_len_t dlength = 0;
18991912

@@ -1917,30 +1930,37 @@ struct record_size_t {
19171930
std::ostream &operator<<(std::ostream&, const record_size_t&);
19181931

19191932
struct record_t {
1920-
transaction_type_t type = TRANSACTION_TYPE_NULL;
1933+
transaction_type_t trans_type = TRANSACTION_TYPE_NULL;
19211934
std::vector<extent_t> extents;
19221935
std::vector<delta_info_t> deltas;
19231936
record_size_t size;
19241937
sea_time_point modify_time = NULL_TIME;
19251938

1926-
record_t(transaction_type_t type) : type{type} { }
1939+
record_t(record_type_t r_type,
1940+
transaction_type_t t_type)
1941+
: trans_type{t_type} {
1942+
assert(r_type != RECORD_TYPE_NULL);
1943+
size.record_type = r_type;
1944+
}
19271945

19281946
// unit test only
19291947
record_t() {
1930-
type = transaction_type_t::MUTATE;
1948+
trans_type = transaction_type_t::MUTATE;
1949+
size.record_type = record_type_t::JOURNAL;
19311950
}
19321951

19331952
// unit test only
19341953
record_t(std::vector<extent_t>&& _extents,
19351954
std::vector<delta_info_t>&& _deltas) {
1955+
trans_type = transaction_type_t::MUTATE;
1956+
size.record_type = record_type_t::JOURNAL;
19361957
auto modify_time = seastar::lowres_system_clock::now();
19371958
for (auto& e: _extents) {
19381959
push_back(std::move(e), modify_time);
19391960
}
19401961
for (auto& d: _deltas) {
19411962
push_back(std::move(d));
19421963
}
1943-
type = transaction_type_t::MUTATE;
19441964
}
19451965

19461966
bool is_empty() const {
@@ -1949,6 +1969,13 @@ struct record_t {
19491969
}
19501970

19511971
std::size_t get_delta_size() const {
1972+
assert(size.record_type < record_type_t::MAX);
1973+
if (size.record_type == record_type_t::OOL) {
1974+
// OOL won't contain metadata
1975+
assert(deltas.size() == 0);
1976+
return 0;
1977+
}
1978+
// JOURNAL
19521979
auto delta_size = std::accumulate(
19531980
deltas.begin(), deltas.end(), 0,
19541981
[](uint64_t sum, auto& delta) {
@@ -2018,6 +2045,7 @@ struct record_group_header_t {
20182045
std::ostream& operator<<(std::ostream&, const record_group_header_t&);
20192046

20202047
struct record_group_size_t {
2048+
record_type_t record_type = RECORD_TYPE_NULL; // must not be NULL in use
20212049
extent_len_t plain_mdlength = 0; // mdlength without the group header
20222050
extent_len_t dlength = 0;
20232051
extent_len_t block_size = 0;
@@ -2033,7 +2061,14 @@ struct record_group_size_t {
20332061

20342062
extent_len_t get_mdlength() const {
20352063
assert(block_size > 0);
2036-
return p2roundup(get_raw_mdlength(), block_size);
2064+
assert(record_type < record_type_t::MAX);
2065+
if (record_type == record_type_t::JOURNAL) {
2066+
return p2roundup(get_raw_mdlength(), block_size);
2067+
} else {
2068+
// OOL won't contain metadata
2069+
assert(get_raw_mdlength() == 0);
2070+
return 0;
2071+
}
20372072
}
20382073

20392074
extent_len_t get_encoded_length() const {
@@ -2401,6 +2436,7 @@ template <> struct fmt::formatter<crimson::os::seastore::record_group_header_t>
24012436
template <> struct fmt::formatter<crimson::os::seastore::record_group_size_t> : fmt::ostream_formatter {};
24022437
template <> struct fmt::formatter<crimson::os::seastore::record_header_t> : fmt::ostream_formatter {};
24032438
template <> struct fmt::formatter<crimson::os::seastore::record_locator_t> : fmt::ostream_formatter {};
2439+
template <> struct fmt::formatter<crimson::os::seastore::record_type_t> : fmt::ostream_formatter {};
24042440
template <> struct fmt::formatter<crimson::os::seastore::record_t> : fmt::ostream_formatter {};
24052441
template <> struct fmt::formatter<crimson::os::seastore::rewrite_gen_printer_t> : fmt::ostream_formatter {};
24062442
template <> struct fmt::formatter<crimson::os::seastore::scan_valid_records_cursor> : fmt::ostream_formatter {};

0 commit comments

Comments
 (0)