Skip to content

Commit 4cb1243

Browse files
committed
crimson/os/seastore: move the root meta out of the root block
During massive data backfilling, new osdmaps keep being created due to frequent pg status changes, which can lead to frequent osd meta updates. Those updates are translated into "SeaStore::write_meta" calls, each of which modifies the root block's meta field and invalidates all in-flight transactions. Since the osd meta updates can be very frequent, long transactions may be invalidated over and over, leaving the corresponding IO requests hanging. This commit moves the root meta out of the root block, so that updates to it won't invalidate unrelated transactions. Signed-off-by: Xuehan Xu <[email protected]>
1 parent 66e6a86 commit 4cb1243

File tree

6 files changed

+153
-51
lines changed

6 files changed

+153
-51
lines changed

src/crimson/os/seastore/cache.cc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,7 @@ void Cache::register_metrics()
172172
{extent_types_t::LADDR_INTERNAL, sm::label_instance("ext", "LADDR_INTERNAL")},
173173
{extent_types_t::LADDR_LEAF, sm::label_instance("ext", "LADDR_LEAF")},
174174
{extent_types_t::DINK_LADDR_LEAF, sm::label_instance("ext", "DINK_LADDR_LEAF")},
175+
{extent_types_t::ROOT_META, sm::label_instance("ext", "ROOT_META")},
175176
{extent_types_t::OMAP_INNER, sm::label_instance("ext", "OMAP_INNER")},
176177
{extent_types_t::OMAP_LEAF, sm::label_instance("ext", "OMAP_LEAF")},
177178
{extent_types_t::ONODE_BLOCK_STAGED, sm::label_instance("ext", "ONODE_BLOCK_STAGED")},
@@ -1093,6 +1094,9 @@ CachedExtentRef Cache::alloc_new_extent_by_type(
10931094
case extent_types_t::LADDR_LEAF:
10941095
return alloc_new_non_data_extent<lba_manager::btree::LBALeafNode>(
10951096
t, length, hint, gen);
1097+
case extent_types_t::ROOT_META:
1098+
return alloc_new_non_data_extent<RootMetaBlock>(
1099+
t, length, hint, gen);
10961100
case extent_types_t::ONODE_BLOCK_STAGED:
10971101
return alloc_new_non_data_extent<onode::SeastoreNodeExtent>(
10981102
t, length, hint, gen);
@@ -2193,6 +2197,12 @@ Cache::do_get_caching_extent_by_type(
21932197
).safe_then([](auto extent) {
21942198
return CachedExtentRef(extent.detach(), false /* add_ref */);
21952199
});
2200+
case extent_types_t::ROOT_META:
2201+
return do_get_caching_extent<RootMetaBlock>(
2202+
offset, length, std::move(extent_init_func), std::move(on_cache)
2203+
).safe_then([](auto extent) {
2204+
return CachedExtentRef(extent.detach(), false /* add_ref */);
2205+
});
21962206
case extent_types_t::OMAP_INNER:
21972207
return do_get_caching_extent<omap_manager::OMapInnerNode>(
21982208
offset, length, std::move(extent_init_func), std::move(on_cache)
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2+
// vim: ts=8 sw=2 smarttab
3+
4+
#pragma once
5+
6+
#include "crimson/os/seastore/cached_extent.h"
7+
8+
namespace crimson::os::seastore {
9+
10+
struct RootMetaBlock : LogicalCachedExtent {
11+
using meta_t = std::map<std::string, std::string>;
12+
using Ref = TCachedExtentRef<RootMetaBlock>;
13+
static constexpr size_t SIZE = 4096;
14+
static constexpr int MAX_META_LENGTH = 1024;
15+
16+
explicit RootMetaBlock(ceph::bufferptr &&ptr)
17+
: LogicalCachedExtent(std::move(ptr)) {}
18+
explicit RootMetaBlock(extent_len_t length)
19+
: LogicalCachedExtent(length) {}
20+
RootMetaBlock(const RootMetaBlock &rhs)
21+
: LogicalCachedExtent(rhs) {}
22+
23+
CachedExtentRef duplicate_for_write(Transaction&) final {
24+
return CachedExtentRef(new RootMetaBlock(*this));
25+
}
26+
27+
static constexpr extent_types_t TYPE = extent_types_t::ROOT_META;
28+
extent_types_t get_type() const final {
29+
return extent_types_t::ROOT_META;
30+
}
31+
32+
/// dumps root meta as delta
33+
ceph::bufferlist get_delta() final {
34+
ceph::bufferlist bl;
35+
ceph::buffer::ptr bptr(get_bptr(), 0, MAX_META_LENGTH);
36+
bl.append(bptr);
37+
return bl;
38+
}
39+
40+
/// overwrites root
41+
void apply_delta(const ceph::bufferlist &_bl) final
42+
{
43+
assert(_bl.length() == MAX_META_LENGTH);
44+
ceph::bufferlist bl = _bl;
45+
bl.rebuild();
46+
get_bptr().copy_in(0, MAX_META_LENGTH, bl.front().c_str());
47+
}
48+
49+
meta_t get_meta() const {
50+
bufferlist bl;
51+
bl.append(get_bptr());
52+
meta_t ret;
53+
auto iter = bl.cbegin();
54+
decode(ret, iter);
55+
return ret;
56+
}
57+
58+
void set_meta(const meta_t &m) {
59+
ceph::bufferlist bl;
60+
encode(m, bl);
61+
ceph_assert(bl.length() <= MAX_META_LENGTH);
62+
bl.rebuild();
63+
get_bptr().zero(0, MAX_META_LENGTH);
64+
get_bptr().copy_in(0, bl.length(), bl.front().c_str());
65+
}
66+
67+
};
68+
using RootMetaBlockRef = RootMetaBlock::Ref;
69+
70+
} // crimson::os::seastore
71+
72+
73+
#if FMT_VERSION >= 90000
74+
template <> struct fmt::formatter<crimson::os::seastore::RootMetaBlock>
75+
: fmt::ostream_formatter {};
76+
#endif

src/crimson/os/seastore/seastore_types.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,8 @@ std::ostream &operator<<(std::ostream &out, extent_types_t t)
246246
return out << "LADDR_LEAF";
247247
case extent_types_t::ONODE_BLOCK_STAGED:
248248
return out << "ONODE_BLOCK_STAGED";
249+
case extent_types_t::ROOT_META:
250+
return out << "ROOT_META";
249251
case extent_types_t::OMAP_INNER:
250252
return out << "OMAP_INNER";
251253
case extent_types_t::OMAP_LEAF:

src/crimson/os/seastore/seastore_types.h

Lines changed: 18 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1378,23 +1378,24 @@ enum class extent_types_t : uint8_t {
13781378
LADDR_INTERNAL = 1,
13791379
LADDR_LEAF = 2,
13801380
DINK_LADDR_LEAF = 3, // should only be used for unittests
1381-
OMAP_INNER = 4,
1382-
OMAP_LEAF = 5,
1383-
ONODE_BLOCK_STAGED = 6,
1384-
COLL_BLOCK = 7,
1385-
OBJECT_DATA_BLOCK = 8,
1386-
RETIRED_PLACEHOLDER = 9,
1381+
ROOT_META = 4,
1382+
OMAP_INNER = 5,
1383+
OMAP_LEAF = 6,
1384+
ONODE_BLOCK_STAGED = 7,
1385+
COLL_BLOCK = 8,
1386+
OBJECT_DATA_BLOCK = 9,
1387+
RETIRED_PLACEHOLDER = 10,
13871388
// the following two types are not extent types,
13881389
// they are just used to indicate paddr allocation deltas
1389-
ALLOC_INFO = 10,
1390-
JOURNAL_TAIL = 11,
1390+
ALLOC_INFO = 11,
1391+
JOURNAL_TAIL = 12,
13911392
// Test Block Types
1392-
TEST_BLOCK = 12,
1393-
TEST_BLOCK_PHYSICAL = 13,
1394-
BACKREF_INTERNAL = 14,
1395-
BACKREF_LEAF = 15,
1393+
TEST_BLOCK = 13,
1394+
TEST_BLOCK_PHYSICAL = 14,
1395+
BACKREF_INTERNAL = 15,
1396+
BACKREF_LEAF = 16,
13961397
// None and the number of valid extent_types_t
1397-
NONE = 16,
1398+
NONE = 17,
13981399
};
13991400
using extent_types_le_t = uint8_t;
14001401
constexpr auto EXTENT_TYPES_MAX = static_cast<uint8_t>(extent_types_t::NONE);
@@ -1409,12 +1410,12 @@ constexpr bool is_data_type(extent_types_t type) {
14091410
}
14101411

14111412
constexpr bool is_logical_metadata_type(extent_types_t type) {
1412-
return type >= extent_types_t::OMAP_INNER &&
1413+
return type >= extent_types_t::ROOT_META &&
14131414
type <= extent_types_t::COLL_BLOCK;
14141415
}
14151416

14161417
constexpr bool is_logical_type(extent_types_t type) {
1417-
if ((type >= extent_types_t::OMAP_INNER &&
1418+
if ((type >= extent_types_t::ROOT_META &&
14181419
type <= extent_types_t::OBJECT_DATA_BLOCK) ||
14191420
type == extent_types_t::TEST_BLOCK) {
14201421
assert(is_logical_metadata_type(type) ||
@@ -1926,44 +1927,18 @@ using backref_root_t = phy_tree_root_t;
19261927
* TODO: generalize this to permit more than one lba_manager implementation
19271928
*/
19281929
struct __attribute__((packed)) root_t {
1929-
using meta_t = std::map<std::string, std::string>;
1930-
1931-
static constexpr int MAX_META_LENGTH = 1024;
1932-
19331930
backref_root_t backref_root;
19341931
lba_root_t lba_root;
19351932
laddr_le_t onode_root;
19361933
coll_root_le_t collection_root;
1934+
laddr_le_t meta;
19371935

1938-
char meta[MAX_META_LENGTH];
1939-
1940-
root_t() {
1941-
set_meta(meta_t{});
1942-
}
1936+
root_t() = default;
19431937

19441938
void adjust_addrs_from_base(paddr_t base) {
19451939
lba_root.adjust_addrs_from_base(base);
19461940
backref_root.adjust_addrs_from_base(base);
19471941
}
1948-
1949-
meta_t get_meta() {
1950-
bufferlist bl;
1951-
bl.append(ceph::buffer::create_static(MAX_META_LENGTH, meta));
1952-
meta_t ret;
1953-
auto iter = bl.cbegin();
1954-
decode(ret, iter);
1955-
return ret;
1956-
}
1957-
1958-
void set_meta(const meta_t &m) {
1959-
ceph::bufferlist bl;
1960-
encode(m, bl);
1961-
ceph_assert(bl.length() < MAX_META_LENGTH);
1962-
bl.rebuild();
1963-
auto &bptr = bl.front();
1964-
::memset(meta, 0, MAX_META_LENGTH);
1965-
::memcpy(meta, bptr.c_str(), bl.length());
1966-
}
19671942
};
19681943

19691944
struct alloc_blk_t {

src/crimson/os/seastore/transaction_manager.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,8 @@ TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs()
7474
return lba_manager->mkfs(t);
7575
}).si_then([this, &t] {
7676
return backref_manager->mkfs(t);
77+
}).si_then([this, &t] {
78+
return init_root_meta(t);
7779
}).si_then([this, FNAME, &t] {
7880
INFOT("submitting mkfs transaction", t);
7981
return submit_transaction_direct(t);

src/crimson/os/seastore/transaction_manager.h

Lines changed: 45 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "crimson/os/seastore/logging.h"
2424
#include "crimson/os/seastore/seastore_types.h"
2525
#include "crimson/os/seastore/cache.h"
26+
#include "crimson/os/seastore/root_meta.h"
2627
#include "crimson/os/seastore/lba_manager.h"
2728
#include "crimson/os/seastore/backref_manager.h"
2829
#include "crimson/os/seastore/journal.h"
@@ -690,9 +691,11 @@ class TransactionManager : public ExtentCallbackInterface {
690691
const std::string &key) {
691692
return cache->get_root(
692693
t
693-
).si_then([&key, &t](auto root) {
694+
).si_then([&t, this](auto root) {
695+
return read_extent<RootMetaBlock>(t, root->root.meta);
696+
}).si_then([key, &t](auto mblock) {
694697
LOG_PREFIX(TransactionManager::read_root_meta);
695-
auto meta = root->root.get_meta();
698+
auto meta = mblock->get_meta();
696699
auto iter = meta.find(key);
697700
if (iter == meta.end()) {
698701
SUBDEBUGT(seastore_tm, "{} -> nullopt", t, key);
@@ -701,7 +704,35 @@ class TransactionManager : public ExtentCallbackInterface {
701704
SUBDEBUGT(seastore_tm, "{} -> {}", t, key, iter->second);
702705
return seastar::make_ready_future<read_root_meta_bare>(iter->second);
703706
}
704-
});
707+
}).handle_error_interruptible(
708+
crimson::ct_error::input_output_error::pass_further{},
709+
crimson::ct_error::assert_all{"unexpected error!"}
710+
);
711+
}
712+
713+
/**
714+
* init_root_meta
715+
*
716+
* create the root meta block
717+
*/
718+
using init_root_meta_iertr = base_iertr;
719+
using init_root_meta_ret = init_root_meta_iertr::future<>;
720+
init_root_meta_ret init_root_meta(Transaction &t) {
721+
return alloc_non_data_extent<RootMetaBlock>(
722+
t, L_ADDR_MIN, RootMetaBlock::SIZE
723+
).si_then([this, &t](auto meta) {
724+
meta->set_meta(RootMetaBlock::meta_t{});
725+
return cache->get_root(t
726+
).si_then([this, &t, meta](auto root) {
727+
auto mroot = cache->duplicate_for_write(
728+
t, root)->template cast<RootBlock>();
729+
mroot->root.meta = meta->get_laddr();
730+
return seastar::now();
731+
});
732+
}).handle_error_interruptible(
733+
crimson::ct_error::input_output_error::pass_further{},
734+
crimson::ct_error::assert_all{"unexpected error!"}
735+
);
705736
}
706737

707738
/**
@@ -719,15 +750,21 @@ class TransactionManager : public ExtentCallbackInterface {
719750
SUBDEBUGT(seastore_tm, "seastore_tm, {} -> {} ...", t, key, value);
720751
return cache->get_root(
721752
t
722-
).si_then([this, &t, &key, &value](RootBlockRef root) {
723-
root = cache->duplicate_for_write(t, root)->cast<RootBlock>();
753+
).si_then([this, &t](RootBlockRef root) {
754+
return read_extent<RootMetaBlock>(t, root->root.meta);
755+
}).si_then([this, key, value, &t](auto mblock) {
756+
mblock = get_mutable_extent(t, mblock
757+
)->template cast<RootMetaBlock>();
724758

725-
auto meta = root->root.get_meta();
759+
auto meta = mblock->get_meta();
726760
meta[key] = value;
727761

728-
root->root.set_meta(meta);
762+
mblock->set_meta(meta);
729763
return seastar::now();
730-
});
764+
}).handle_error_interruptible(
765+
crimson::ct_error::input_output_error::pass_further{},
766+
crimson::ct_error::assert_all{"unexpected error!"}
767+
);
731768
}
732769

733770
/**

0 commit comments

Comments
 (0)