Skip to content

Commit 930ed08

Browse files
authored
Merge pull request ceph#54613 from Matan-B/wip-matanb-crimson-build-inc-maps
crimson/osd: Support incremental maps Reviewed-by: Samuel Just <[email protected]>
2 parents 7455933 + 57030a0 commit 930ed08

File tree

6 files changed

+164
-49
lines changed

6 files changed

+164
-49
lines changed

src/crimson/osd/heartbeat.cc

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -333,9 +333,8 @@ seastar::future<> Heartbeat::maybe_share_osdmap(
333333
return seastar::now();
334334
}
335335

336-
const epoch_t send_from = peer.get_projected_epoch();
337-
logger().debug("{} sending peer {} peer maps from projected epoch {} through "
338-
"local osdmap epoch {}",
336+
const epoch_t send_from = peer.get_projected_epoch() + 1;
337+
logger().debug("{} sending peer {} peer maps ({}, {}]",
339338
__func__,
340339
from,
341340
send_from,

src/crimson/osd/osd_meta.cc

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
#include "osd/OSDMap.h"
1313

1414
using std::string;
15-
using read_errorator = crimson::os::FuturizedStore::Shard::read_errorator;
1615

1716
void OSDMeta::create(ceph::os::Transaction& t)
1817
{
@@ -25,11 +24,22 @@ void OSDMeta::store_map(ceph::os::Transaction& t,
2524
t.write(coll->get_cid(), osdmap_oid(e), 0, m.length(), m);
2625
}
2726

27+
/// Queue a write of the incremental osdmap buffer for epoch `e` on `t`.
///
/// The whole of `m` (offset 0, m.length() bytes) is written to the object
/// named by inc_osdmap_oid(e) in the collection held by `coll`.
void OSDMeta::store_inc_map(ceph::os::Transaction& t,
                            epoch_t e, const bufferlist& m)
{
  const ghobject_t target = inc_osdmap_oid(e);
  t.write(coll->get_cid(), target, 0, m.length(), m);
}
32+
2833
/// Queue removal of the full osdmap object for epoch `e` on `t`.
void OSDMeta::remove_map(ceph::os::Transaction& t, epoch_t e)
{
  const ghobject_t victim = osdmap_oid(e);
  t.remove(coll->get_cid(), victim);
}
3237

38+
/// Queue removal of the incremental osdmap object for epoch `e` on `t`.
void OSDMeta::remove_inc_map(ceph::os::Transaction& t, epoch_t e)
{
  const ghobject_t victim = inc_osdmap_oid(e);
  t.remove(coll->get_cid(), victim);
}
42+
3343
seastar::future<bufferlist> OSDMeta::load_map(epoch_t e)
3444
{
3545
return store.read(coll,
@@ -41,6 +51,13 @@ seastar::future<bufferlist> OSDMeta::load_map(epoch_t e)
4151
}));
4252
}
4353

54+
/// Read back the incremental osdmap object stored for epoch `e`.
///
/// The read is issued with offset 0 and length 0 and the
/// CEPH_OSD_OP_FLAG_FADVISE_WILLNEED flag.
/// NOTE(review): length 0 presumably means "whole object" -- confirm against
/// FuturizedStore::Shard::read.
///
/// @param e epoch of the incremental map to load
/// @return the object's contents; read errors are reported through
///         read_errorator and left for the caller to handle.
read_errorator::future<ceph::bufferlist> OSDMeta::load_inc_map(epoch_t e)
{
  // Read the object written by store_inc_map() (inc_osdmap_oid), not the
  // full-map object (osdmap_oid): otherwise a full map would be handed back
  // to callers that treat the result as an incremental.
  return store.read(coll,
                    inc_osdmap_oid(e), 0, 0,
                    CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
}
60+
4461
void OSDMeta::store_superblock(ceph::os::Transaction& t,
4562
const OSDSuperblock& superblock)
4663
{
@@ -122,6 +139,12 @@ ghobject_t OSDMeta::osdmap_oid(epoch_t epoch)
122139
return ghobject_t(hobject_t(sobject_t(object_t(name), 0)));
123140
}
124141

142+
/// Object id under which the incremental osdmap for `epoch` is kept:
/// an object named "inc_osdmap.<epoch>" wrapped into a ghobject_t.
ghobject_t OSDMeta::inc_osdmap_oid(epoch_t epoch)
{
  const sobject_t sobj(object_t(fmt::format("inc_osdmap.{}", epoch)), 0);
  return ghobject_t(hobject_t(sobj));
}
147+
125148
ghobject_t OSDMeta::final_pool_info_oid(int64_t pool)
126149
{
127150
string name = fmt::format("final_pool_{}", pool);

src/crimson/osd/osd_meta.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ namespace crimson::os {
1919
class FuturizedStore;
2020
}
2121

22+
using read_errorator = crimson::os::FuturizedStore::Shard::read_errorator;
23+
2224
/// metadata shared across PGs, or put in another way,
2325
/// metadata not specific to certain PGs.
2426
class OSDMeta {
@@ -40,8 +42,13 @@ class OSDMeta {
4042

4143
void store_map(ceph::os::Transaction& t,
4244
epoch_t e, const bufferlist& m);
45+
void store_inc_map(ceph::os::Transaction& t,
46+
epoch_t e, const bufferlist& m);
4347
void remove_map(ceph::os::Transaction& t, epoch_t e);
48+
void remove_inc_map(ceph::os::Transaction& t, epoch_t e);
49+
4450
seastar::future<bufferlist> load_map(epoch_t e);
51+
read_errorator::future<ceph::bufferlist> load_inc_map(epoch_t e);
4552

4653
void store_superblock(ceph::os::Transaction& t,
4754
const OSDSuperblock& sb);
@@ -60,6 +67,7 @@ class OSDMeta {
6067
std::map<epoch_t, OSDMap*>&);
6168
private:
6269
static ghobject_t osdmap_oid(epoch_t epoch);
70+
static ghobject_t inc_osdmap_oid(epoch_t epoch);
6371
static ghobject_t final_pool_info_oid(int64_t pool);
6472
static ghobject_t superblock_oid();
6573
};

src/crimson/osd/osdmap_service.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,13 @@ class OSDMapService {
1212
public:
1313
using cached_map_t = OSDMapRef;
1414
using local_cached_map_t = LocalOSDMapRef;
15+
/// Tags an encoded osdmap buffer with the form it was stored in.
enum class encoded_osdmap_type_t {
  FULLMAP,  ///< buffer holds a full osdmap
  INCMAP,   ///< buffer holds an incremental osdmap
};
19+
using bls_pair = std::pair<encoded_osdmap_type_t, bufferlist>;
20+
using bls_map_pair_t = std::pair<epoch_t, bls_pair>;
21+
using bls_map_t = std::map<epoch_t, bls_pair>;
1522

1623
virtual ~OSDMapService() = default;
1724
virtual seastar::future<cached_map_t> get_map(epoch_t e) = 0;

src/crimson/osd/shard_services.cc

Lines changed: 112 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,14 @@ void OSDSingletonState::store_map_bl(
379379
map_bl_cache.insert(e, std::move(bl));
380380
}
381381

382+
/// Persist an incremental osdmap buffer and remember it in the cache.
///
/// The write is queued on `t` through meta_coll first, while `bl` is still
/// intact; only afterwards is `bl` moved into inc_map_bl_cache under `e`.
void OSDSingletonState::store_inc_map_bl(
  ceph::os::Transaction& t,
  epoch_t e, bufferlist&& bl)
{
  const bufferlist& encoded = bl;
  meta_coll->store_inc_map(t, e, encoded);
  inc_map_bl_cache.insert(e, std::move(bl));
}
389+
382390
seastar::future<bufferlist> OSDSingletonState::load_map_bl(
383391
epoch_t e)
384392
{
@@ -387,29 +395,56 @@ seastar::future<bufferlist> OSDSingletonState::load_map_bl(
387395
return seastar::make_ready_future<bufferlist>(*found);
388396
} else {
389397
logger().debug("{} loading osdmap.{} from disk", __func__, e);
390-
return meta_coll->load_map(e);
398+
return meta_coll->load_map(e).then([this, e](auto&& bl) {
399+
map_bl_cache.insert(e, bl);
400+
return seastar::make_ready_future<bufferlist>(std::move(bl));
401+
});
391402
}
392403
}
393404

394-
seastar::future<std::map<epoch_t, bufferlist>> OSDSingletonState::load_map_bls(
405+
/// Fetch the encoded incremental osdmap for epoch `e`.
///
/// A hit in inc_map_bl_cache is returned directly. Otherwise the buffer is
/// loaded through meta_coll, inserted into the cache, and returned; read
/// errors are passed on to the caller unchanged.
read_errorator::future<ceph::bufferlist> OSDSingletonState::load_inc_map_bl(
  epoch_t e)
{
  // Fast path: already cached.
  if (std::optional<bufferlist> cached = inc_map_bl_cache.find(e); cached) {
    logger().debug("{} inc map.{} found in cache", __func__, e);
    return read_errorator::make_ready_future<bufferlist>(*cached);
  }

  // Slow path: read from the store and populate the cache on success.
  logger().debug("{} loading inc map.{} from disk", __func__, e);
  return meta_coll->load_inc_map(e).safe_then(
    [this, e](auto&& loaded) {
      inc_map_bl_cache.insert(e, loaded);
      return seastar::make_ready_future<bufferlist>(std::move(loaded));
    },
    read_errorator::pass_further{});
}
419+
420+
seastar::future<OSDMapService::bls_map_t> OSDSingletonState::load_map_bls(
395421
epoch_t first,
396422
epoch_t last)
397423
{
398424
logger().debug("{} loading maps [{},{}]",
399425
__func__, first, last);
400426
ceph_assert(first <= last);
401-
// TODO: take osd_map_max into account
402-
//int max = cct->_conf->osd_map_message_max;
403-
//ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;
404427
return seastar::map_reduce(boost::make_counting_iterator<epoch_t>(first),
405428
boost::make_counting_iterator<epoch_t>(last + 1),
406429
[this](epoch_t e) {
407-
return load_map_bl(e).then([e](auto&& bl) {
408-
return seastar::make_ready_future<std::pair<epoch_t, bufferlist>>(
409-
std::make_pair(e, std::move(bl)));
430+
return load_inc_map_bl(e).safe_then([](auto&& bl) {
431+
return seastar::make_ready_future<OSDMapService::bls_pair>(
432+
std::make_pair(OSDMapService::encoded_osdmap_type_t::INCMAP,
433+
std::move(bl)));
434+
}, read_errorator::all_same_way([this, e] {
435+
logger().debug("load_map_bls: can't load inc map {}, attempting full map instread",
436+
e);
437+
return load_map_bl(e).then([](auto&& bl) {
438+
return seastar::make_ready_future<OSDMapService::bls_pair>(
439+
std::make_pair(OSDMapService::encoded_osdmap_type_t::FULLMAP,
440+
std::move(bl)));
441+
});
442+
})).then([e] (auto&& loaded_map) {
443+
return seastar::make_ready_future<OSDMapService::bls_map_pair_t>(
444+
std::make_pair(e, std::move(loaded_map)));
410445
});
411446
},
412-
std::map<epoch_t, bufferlist>{},
447+
OSDMapService::bls_map_t{},
413448
[](auto&& bls, auto&& epoch_bl) {
414449
bls.emplace(std::move(epoch_bl));
415450
return std::move(bls);
@@ -453,11 +488,12 @@ seastar::future<> OSDSingletonState::store_maps(ceph::os::Transaction& t,
453488
"loading osdmap.{}", e, e - 1);
454489
ceph_assert(std::cmp_greater(e, 0u));
455490
return load_map(e - 1).then(
456-
[&added_maps, e, bl=p->second, &t, this](auto o) {
491+
[&added_maps, e, bl=p->second, &t, this](auto o) mutable {
457492
OSDMap::Incremental inc;
458493
auto i = bl.cbegin();
459494
inc.decode(i);
460495
o->apply_incremental(inc);
496+
store_inc_map_bl(t, e, std::move(bl));
461497
bufferlist fbl;
462498
o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
463499
logger().info("store_maps storing osdmap.{}", o->get_epoch());
@@ -499,6 +535,7 @@ void OSDSingletonState::trim_maps(ceph::os::Transaction& t,
499535
t.get_num_ops() < crimson::common::local_conf()->osd_target_transaction_size) {
500536
logger().debug("{}: removing old osdmap epoch {}", __func__, superblock.get_oldest_map());
501537
meta_coll->remove_map(t, superblock.get_oldest_map());
538+
meta_coll->remove_inc_map(t, superblock.get_oldest_map());
502539
superblock.maps.erase(superblock.get_oldest_map());
503540
}
504541

@@ -757,49 +794,80 @@ seastar::future<> ShardServices::dispatch_context(
757794
});
758795
}
759796

760-
seastar::future<> OSDSingletonState::send_incremental_map(
761-
crimson::net::Connection &conn,
762-
epoch_t first)
797+
/// Build an MOSDMap message carrying the maps for epochs [first, last].
///
/// If `first` is older than superblock.cluster_osdmap_trim_lower_bound, the
/// range is restarted at that bound and the map for that epoch is attached as
/// a full map (m->maps). The remaining epochs are loaded with load_map_bls()
/// and attached to m->maps or m->incremental_maps according to their tag.
/// The epoch range is capped using osd_map_message_max and the attached
/// payload using osd_map_message_max_bytes.
///
/// @param first first epoch to include
/// @param last  last epoch to include
/// @return the populated message
seastar::future<MURef<MOSDMap>> OSDSingletonState::build_incremental_map_msg(
  epoch_t first,
  epoch_t last)
{
  // The epoch cursor and the remaining-epoch budget are owned by do_with so
  // that the continuations below can safely hold references to them even when
  // load_map_bl() does not resolve immediately. (References to this function's
  // by-value parameter, or to a by-value lambda parameter, would dangle.)
  return seastar::do_with(
    epoch_t{first},
    static_cast<unsigned int>(
      crimson::common::local_conf()->osd_map_message_max),
    crimson::make_message<MOSDMap>(
      monc.get_fsid(),
      osdmap->get_encoding_features()),
    [this, last](epoch_t& start,
                 unsigned int& map_message_max,
                 auto& m) {
    m->cluster_osdmap_trim_lower_bound = superblock.cluster_osdmap_trim_lower_bound;
    m->newest_map = superblock.get_newest_map();
    auto maybe_handle_mapgap = seastar::now();
    if (start < superblock.cluster_osdmap_trim_lower_bound) {
      logger().info("{}: cluster osdmap lower bound: {} "
                    " > first {}, starting with full map",
                    __func__, superblock.cluster_osdmap_trim_lower_bound, start);
      // we don't have the next map the target wants,
      // so start with a full map.
      start = superblock.cluster_osdmap_trim_lower_bound;
      maybe_handle_mapgap = load_map_bl(start).then(
        [&start, &map_message_max, &m](auto&& bl) {
        m->maps[start] = std::move(bl);
        --map_message_max;
        ++start;
      });
    }
    // `start` is read by reference here so that the increment performed by
    // the gap handler is observed regardless of when that handler ran.
    return maybe_handle_mapgap.then([this, last, &start, &map_message_max, &m] {
      if (start > last) {
        // start may be later than last in the case of map gap
        ceph_assert(!m->maps.empty());
        return seastar::make_ready_future<MURef<MOSDMap>>(std::move(m));
      }
      return load_map_bls(
        start,
        ((last - start) > map_message_max) ? (start + map_message_max) : last
      ).then([&m](auto&& bls) {
        ssize_t map_message_max_bytes = crimson::common::local_conf()->osd_map_message_max_bytes;
        // Non-const binding: the buffers are moved into the message.
        for (auto& [e, val] : bls) {
          map_message_max_bytes -= val.second.length();
          if (map_message_max_bytes < 0) {
            break;
          }
          if (val.first == OSDMapService::encoded_osdmap_type_t::FULLMAP) {
            m->maps.emplace(e, std::move(val.second));
          } else if (val.first == OSDMapService::encoded_osdmap_type_t::INCMAP) {
            m->incremental_maps.emplace(e, std::move(val.second));
          } else {
            ceph_abort();
          }
        }
        return seastar::make_ready_future<MURef<MOSDMap>>(std::move(m));
      });
    });
  });
}
853+
854+
/// Send over `conn` the osdmaps from `first` up to the current osdmap epoch.
///
/// When the distance between `first` and the current epoch exceeds
/// osd_map_share_max_epochs, `first` is advanced so that only the most recent
/// epochs are sent. The message itself is assembled by
/// build_incremental_map_msg().
///
/// NOTE(review): `conn` is captured by reference by the send continuation;
/// presumably the caller keeps it alive until the returned future resolves.
seastar::future<> OSDSingletonState::send_incremental_map(
  crimson::net::Connection &conn,
  epoch_t first)
{
  const epoch_t newest = osdmap->get_epoch();
  const auto share_max =
    crimson::common::local_conf()->osd_map_share_max_epochs;
  logger().info("{}: first osdmap: {} "
                "superblock's oldest map: {}, "
                "to {}",
                __func__, first, superblock.get_oldest_map(), newest);

  const bool too_far_behind =
    newest > first && static_cast<int64_t>(newest - first) > share_max;
  if (too_far_behind) {
    logger().debug("{} {} > max epochs to send of {}, only sending most recent,",
                   __func__, (newest - first), share_max);
    first = newest - share_max;
  }

  return build_incremental_map_msg(first, newest).then(
    [&conn](auto&& msg) {
      return conn.send(std::move(msg));
    });
}
804872

805873
seastar::future<> OSDSingletonState::send_incremental_map_to_osd(

src/crimson/osd/shard_services.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,7 @@ class OSDSingletonState : public md_config_obs_t {
218218
friend class OSD;
219219
using cached_map_t = OSDMapService::cached_map_t;
220220
using local_cached_map_t = OSDMapService::local_cached_map_t;
221+
using read_errorator = crimson::os::FuturizedStore::Shard::read_errorator;
221222

222223
public:
223224
OSDSingletonState(
@@ -236,6 +237,7 @@ class OSDSingletonState : public md_config_obs_t {
236237

237238
SharedLRU<epoch_t, OSDMap> osdmaps;
238239
SimpleLRU<epoch_t, bufferlist, false> map_bl_cache;
240+
SimpleLRU<epoch_t, bufferlist, false> inc_map_bl_cache;
239241

240242
cached_map_t osdmap;
241243
cached_map_t &get_osdmap() { return osdmap; }
@@ -268,6 +270,10 @@ class OSDSingletonState : public md_config_obs_t {
268270
superblock = std::move(_superblock);
269271
}
270272

273+
seastar::future<MURef<MOSDMap>> build_incremental_map_msg(
274+
epoch_t first,
275+
epoch_t last);
276+
271277
seastar::future<> send_incremental_map(
272278
crimson::net::Connection &conn,
273279
epoch_t first);
@@ -318,10 +324,13 @@ class OSDSingletonState : public md_config_obs_t {
318324
seastar::future<local_cached_map_t> get_local_map(epoch_t e);
319325
seastar::future<std::unique_ptr<OSDMap>> load_map(epoch_t e);
320326
seastar::future<bufferlist> load_map_bl(epoch_t e);
321-
seastar::future<std::map<epoch_t, bufferlist>>
327+
read_errorator::future<ceph::bufferlist> load_inc_map_bl(epoch_t e);
328+
seastar::future<OSDMapService::bls_map_t>
322329
load_map_bls(epoch_t first, epoch_t last);
323330
void store_map_bl(ceph::os::Transaction& t,
324331
epoch_t e, bufferlist&& bl);
332+
void store_inc_map_bl(ceph::os::Transaction& t,
333+
epoch_t e, bufferlist&& bl);
325334
seastar::future<> store_maps(ceph::os::Transaction& t,
326335
epoch_t start, Ref<MOSDMap> m);
327336
void trim_maps(ceph::os::Transaction& t, OSDSuperblock& superblock);
@@ -505,6 +514,7 @@ class ShardServices : public OSDMapService {
505514
FORWARD_TO_OSD_SINGLETON(get_pool_info)
506515
FORWARD(with_throttle_while, with_throttle_while, local_state.throttler)
507516

517+
FORWARD_TO_OSD_SINGLETON(build_incremental_map_msg)
508518
FORWARD_TO_OSD_SINGLETON(send_incremental_map)
509519
FORWARD_TO_OSD_SINGLETON(send_incremental_map_to_osd)
510520

0 commit comments

Comments
 (0)