Skip to content

Commit 7599257

Browse files
committed
mds/quiesce: resolve the quiesce cluster at the mds monitor
Signed-off-by: Leonid Usov <[email protected]>
1 parent 42a5fb3 commit 7599257

File tree

13 files changed

+175
-120
lines changed

13 files changed

+175
-120
lines changed

qa/tasks/mgr/dashboard/test_health.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,9 @@ class HealthTest(DashboardTestCase):
6363
'balance_automate': bool,
6464
}),
6565
'ever_allowed_features': int,
66-
'root': int
66+
'root': int,
67+
'qdb_leader': int,
68+
'qdb_cluster': JList(int)
6769
})
6870

6971
def test_minimal_health(self):

src/include/cephfs/types.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,14 @@
4848
BOOST_STRONG_TYPEDEF(uint64_t, mds_gid_t)
4949
extern const mds_gid_t MDS_GID_NONE;
5050

51+
template <>
52+
struct std::hash<mds_gid_t> {
53+
size_t operator()(const mds_gid_t& gid) const
54+
{
55+
return hash<uint64_t> {}(gid);
56+
}
57+
};
58+
5159
typedef int32_t fs_cluster_id_t;
5260
constexpr fs_cluster_id_t FS_CLUSTER_ID_NONE = -1;
5361

src/mds/FSMap.h

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include <set>
2222
#include <string>
2323
#include <string_view>
24+
#include <type_traits>
2425

2526
#include <errno.h>
2627

@@ -479,9 +480,18 @@ class FSMap {
479480
void modify_filesystem(fs_cluster_id_t fscid, T&& fn)
480481
{
481482
auto& fs = filesystems.at(fscid);
482-
fn(fs);
483-
fs.mds_map.epoch = epoch;
484-
fs.mds_map.modified = ceph_clock_now();
483+
bool did_update = true;
484+
485+
if constexpr (std::is_convertible_v<std::invoke_result_t<T, Filesystem&>, bool>) {
486+
did_update = fn(fs);
487+
} else {
488+
fn(fs);
489+
}
490+
491+
if (did_update) {
492+
fs.mds_map.epoch = epoch;
493+
fs.mds_map.modified = ceph_clock_now();
494+
}
485495
}
486496

487497
/* This method is written for the option of the "ceph fs swap" command

src/mds/MDSMap.cc

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,12 @@ void MDSMap::dump(Formatter *f) const
228228
f->dump_string("balancer", balancer);
229229
f->dump_string("bal_rank_mask", bal_rank_mask);
230230
f->dump_int("standby_count_wanted", std::max(0, standby_count_wanted));
231+
f->dump_unsigned("qdb_leader", qdb_cluster_leader);
232+
f->open_array_section("qdb_cluster");
233+
for (auto m: qdb_cluster_members) {
234+
f->dump_int("member", m);
235+
}
236+
f->close_section();
231237
}
232238

233239
void MDSMap::dump_flags_state(Formatter *f) const
@@ -290,6 +296,7 @@ void MDSMap::print(ostream& out) const
290296
out << "balancer\t" << balancer << "\n";
291297
out << "bal_rank_mask\t" << bal_rank_mask << "\n";
292298
out << "standby_count_wanted\t" << std::max(0, standby_count_wanted) << "\n";
299+
out << "qdb_cluster\tleader: " << qdb_cluster_leader << " members: " << qdb_cluster_members << std::endl;
293300

294301
multimap< pair<mds_rank_t, unsigned>, mds_gid_t > foo;
295302
for (const auto &p : mds_info) {
@@ -773,7 +780,7 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const
773780
encode(data_pools, bl);
774781
encode(cas_pool, bl);
775782

776-
__u16 ev = 18;
783+
__u16 ev = 19;
777784
encode(ev, bl);
778785
encode(compat, bl);
779786
encode(metadata_pool, bl);
@@ -802,6 +809,8 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const
802809
encode(required_client_features, bl);
803810
encode(bal_rank_mask, bl);
804811
encode(max_xattr_size, bl);
812+
encode(qdb_cluster_leader, bl);
813+
encode(qdb_cluster_members, bl);
805814
ENCODE_FINISH(bl);
806815
}
807816

@@ -957,6 +966,11 @@ void MDSMap::decode(bufferlist::const_iterator& p)
957966
decode(max_xattr_size, p);
958967
}
959968

969+
if (ev >= 19) {
970+
decode(qdb_cluster_leader, p);
971+
decode(qdb_cluster_members, p);
972+
}
973+
960974
/* All MDS since at least v14.0.0 understand INLINE */
961975
/* TODO: remove after R is released */
962976
compat.incompat.insert(MDS_FEATURE_INCOMPAT_INLINE);

src/mds/MDSMap.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <map>
2020
#include <set>
2121
#include <string>
22+
#include <ranges>
2223
#include <string_view>
2324

2425
#include <errno.h>
@@ -312,6 +313,29 @@ class MDSMap {
312313
mds_rank_t get_tableserver() const { return tableserver; }
313314
mds_rank_t get_root() const { return root; }
314315

316+
// Copy the quiesce db cluster stored in this map into the caller's
// output arguments: the leader gid and the set of member gids.
void get_quiesce_db_cluster(mds_gid_t &leader,
                            std::unordered_set<mds_gid_t> &members) const
{
  leader = qdb_cluster_leader;
  members = qdb_cluster_members;
}
320+
321+
// Return the gid of the quiesce db cluster leader stored in this map
// (MDS_GID_NONE when no leader has been assigned).
// Const-qualified: this is a pure getter and must be callable on a const map.
mds_gid_t get_quiesce_db_cluster_leader() const {
  return qdb_cluster_leader;
}
324+
325+
// Replace the quiesce db cluster stored in this map.
//
// @param leader   gid of the new leader, or MDS_GID_NONE for "no leader"
// @param members  gids of the new members; the constraint is only satisfied
//                 by a non-const rvalue std::unordered_set<mds_gid_t>, whose
//                 contents are taken over by this map when an update happens
// @return false if both leader and members already equal the stored values
//         (nothing is modified), true if the stored cluster was replaced
//
// Asserts that the leader (unless it is MDS_GID_NONE) and every member are
// present in mds_info.
bool update_quiesce_db_cluster(mds_gid_t const& leader, std::same_as<std::unordered_set<mds_gid_t>> auto && members) {
  if (leader == qdb_cluster_leader && members == qdb_cluster_members) {
    return false;
  }

  ceph_assert(leader == MDS_GID_NONE || mds_info.contains(leader));
  ceph_assert(std::ranges::all_of(members, [this](auto &m) {return mds_info.contains(m);}));

  qdb_cluster_leader = leader;
  // `members` can only bind to an rvalue set, so take over its storage
  // instead of deep-copying it; the argument is left holding the old members.
  qdb_cluster_members.swap(members);

  return true;
}
338+
315339
const std::vector<int64_t> &get_data_pools() const { return data_pools; }
316340
int64_t get_first_data_pool() const { return *data_pools.begin(); }
317341
int64_t get_metadata_pool() const { return metadata_pool; }
@@ -634,6 +658,8 @@ class MDSMap {
634658

635659
mds_rank_t tableserver = 0; // which MDS has snaptable
636660
mds_rank_t root = 0; // which MDS has root directory
661+
std::unordered_set<mds_gid_t> qdb_cluster_members;
662+
mds_gid_t qdb_cluster_leader = MDS_GID_NONE;
637663

638664
__u32 session_timeout = 60;
639665
__u32 session_autoclose = 300;

src/mds/MDSRankQuiesce.cc

Lines changed: 18 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ void MDSRank::command_quiesce_db(const cmdmap_t& cmdmap, std::function<void(int,
101101
struct Ctx : public QuiesceDbManager::RequestContext {
102102
std::function<void(int, const std::string&, bufferlist&)> on_finish;
103103
bool all = false;
104+
mds_gid_t me;
104105

105106
double sec(QuiesceTimeInterval duration) {
106107
return duration_cast<dd>(duration).count();
@@ -126,6 +127,7 @@ void MDSRank::command_quiesce_db(const cmdmap_t& cmdmap, std::function<void(int,
126127

127128
f->open_object_section("response"); {
128129
f->dump_int("epoch", response.db_version.epoch);
130+
f->dump_int("leader", me);
129131
f->dump_int("set_version", response.db_version.set_version);
130132
f->open_object_section("sets"); {
131133
for (auto&& [set_id, set] : response.sets) {
@@ -168,8 +170,10 @@ void MDSRank::command_quiesce_db(const cmdmap_t& cmdmap, std::function<void(int,
168170

169171
auto* ctx = new Ctx();
170172

173+
QuiesceInterface::PeerId me = mds_gid_t(monc->get_global_id());
171174
ctx->on_finish = std::move(on_finish);
172175
ctx->all = all;
176+
ctx->me = me;
173177

174178
ctx->request.reset([&](auto& r) {
175179
r.set_id = set_id;
@@ -212,6 +216,12 @@ void MDSRank::command_quiesce_db(const cmdmap_t& cmdmap, std::function<void(int,
212216
int rc = quiesce_db_manager->submit_request(ctx);
213217
if (rc != 0) {
214218
bufferlist bl;
219+
auto f = Formatter::create_unique("json-pretty");
220+
f->open_object_section("response");
221+
f->dump_int("epoch", mdsmap->get_epoch());
222+
f->dump_int("leader", mdsmap->get_quiesce_db_cluster_leader());
223+
f->close_section();
224+
f->flush(bl);
215225
// on_finish was moved there, so should only call via the ctx.
216226
ctx->on_finish(rc, "Error submitting the command to the local db manager", bl);
217227
delete ctx;
@@ -234,62 +244,35 @@ static void rebind_agent_callback(std::shared_ptr<QuiesceAgent> agt, std::shared
234244

235245
void MDSRank::quiesce_cluster_update() {
236246
// the quiesce leader is the lowest rank with the highest state up to ACTIVE
237-
auto less_leader = [](MDSMap::mds_info_t const* l, MDSMap::mds_info_t const* r) {
238-
ceph_assert(l->rank != MDS_RANK_NONE);
239-
ceph_assert(r->rank != MDS_RANK_NONE);
240-
ceph_assert(l->state <= MDSMap::STATE_ACTIVE);
241-
ceph_assert(r->state <= MDSMap::STATE_ACTIVE);
242-
if (l->rank == r->rank) {
243-
return l->state < r->state;
244-
} else {
245-
return l->rank > r->rank;
246-
}
247-
};
248-
249-
std::priority_queue<MDSMap::mds_info_t const*, std::vector<MDSMap::mds_info_t const*>, decltype(less_leader)> member_info(less_leader);
250247
QuiesceClusterMembership membership;
251-
252248
QuiesceInterface::PeerId me = mds_gid_t(monc->get_global_id());
253249

254-
for (auto&& [gid, info] : mdsmap->get_mds_info()) {
255-
// if it has a rank and state <= ACTIVE, it's good enough
256-
// if (info.rank != MDS_RANK_NONE && info.state <= MDSMap::STATE_ACTIVE) {
257-
if (info.rank != MDS_RANK_NONE && info.state == MDSMap::STATE_ACTIVE) {
258-
member_info.push(&info);
259-
membership.members.insert(info.global_id);
260-
}
261-
}
262-
263-
QuiesceInterface::PeerId leader =
264-
member_info.empty()
265-
? QuiesceClusterMembership::INVALID_MEMBER
266-
: member_info.top()->global_id;
250+
mdsmap->get_quiesce_db_cluster(membership.leader, membership.members);
267251

268252
membership.epoch = mdsmap->get_epoch();
269-
membership.leader = leader;
270253
membership.me = me;
271254
membership.fs_name = mdsmap->get_fs_name();
272255

273-
dout(5) << "epoch:" << membership.epoch << " me:" << me << " leader:" << leader << " members:" << membership.members
256+
dout(5) << "epoch:" << membership.epoch << " me:" << me << " leader:" << membership.leader << " members:" << membership.members
274257
<< (mdsmap->is_degraded() ? " (degraded)" : "") << dendl;
275258

276-
if (leader != QuiesceClusterMembership::INVALID_MEMBER) {
259+
if (membership.leader != QuiesceClusterMembership::INVALID_MEMBER) {
277260
membership.send_ack = [=, this](QuiesceMap&& ack) {
278-
if (me == leader) {
261+
if (me == membership.leader) {
279262
// loopback
280263
quiesce_db_manager->submit_ack_from(me, std::move(ack));
281264
return 0;
282265
} else {
283266
std::lock_guard guard(mds_lock);
284267

285-
if (mdsmap->get_state_gid(leader) == MDSMap::STATE_NULL) {
286-
dout(5) << "couldn't find the leader " << leader << " in the map" << dendl;
268+
if (mdsmap->get_state_gid(membership.leader) == MDSMap::STATE_NULL) {
269+
dout(5) << "couldn't find the leader " << membership.leader << " in the map" << dendl;
287270
return -ENOENT;
288271
}
289-
auto addrs = mdsmap->get_info_gid(leader).addrs;
272+
auto addrs = mdsmap->get_info_gid(membership.leader).addrs;
290273

291274
auto ack_msg = make_message<MMDSQuiesceDbAck>();
292-
dout(10) << "sending ack " << ack << " to the leader " << leader << dendl;
275+
dout(10) << "sending ack " << ack << " to the leader " << membership.leader << dendl;
293276
ack_msg->encode_payload_from(me, ack);
294277
return send_message_mds(ack_msg, addrs);
295278
}

src/mds/QuiesceDbManager.h

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,6 @@
1919
#include <set>
2020
#include <queue>
2121

22-
template <>
23-
struct std::hash<mds_gid_t> {
24-
size_t operator()(const mds_gid_t& gid) const
25-
{
26-
return hash<uint64_t> {}(gid);
27-
}
28-
};
29-
3022
struct QuiesceClusterMembership {
3123
static const QuiesceInterface::PeerId INVALID_MEMBER;
3224

@@ -36,7 +28,7 @@ struct QuiesceClusterMembership {
3628

3729
QuiesceInterface::PeerId me = INVALID_MEMBER;
3830
QuiesceInterface::PeerId leader = INVALID_MEMBER;
39-
std::set<QuiesceInterface::PeerId> members;
31+
std::unordered_set<QuiesceInterface::PeerId> members;
4032

4133
// A courier interface to decouple from the messaging layer
4234
// Failures can be ignored, manager will call this repeatedly if needed
@@ -69,7 +61,7 @@ class QuiesceDbManager {
6961

7062
// ============================
7163
// quiesce db leader interface:
72-
// -> EPERM unless this is the leader
64+
// -> ENOTTY unless this is the leader
7365

7466
// client interface to the DB
7567
int submit_request(RequestContext* ctx) {

src/mon/MDSMonitor.cc

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414

1515
#include <regex>
1616
#include <sstream>
17+
#include <queue>
18+
#include <ranges>
1719
#include <boost/utility.hpp>
1820

1921
#include "MDSMonitor.h"
@@ -174,11 +176,61 @@ void MDSMonitor::create_pending()
174176
dout(10) << "create_pending e" << fsmap.get_epoch() << dendl;
175177
}
176178

179+
// Recompute the quiesce db cluster (leader + members) for every filesystem
// of the given fsmap whose MDS map epoch is not older than the fsmap epoch,
// and store the result in that filesystem's MDS map.
void MDSMonitor::assign_quiesce_db_leader(FSMap &fsmap) {

  // Heap ordering for leader candidates: top() yields the candidate with the
  // lowest rank; between entries of equal rank the higher state wins.
  auto leader_order = [](MDSMap::mds_info_t const* lhs, MDSMap::mds_info_t const* rhs) {
    ceph_assert(lhs->rank != MDS_RANK_NONE);
    ceph_assert(rhs->rank != MDS_RANK_NONE);
    ceph_assert(lhs->state <= MDSMap::STATE_ACTIVE);
    ceph_assert(rhs->state <= MDSMap::STATE_ACTIVE);
    return (lhs->rank == rhs->rank)
      ? (lhs->state < rhs->state)
      : (lhs->rank > rhs->rank);
  };

  using candidate_heap_t =
    std::priority_queue<MDSMap::mds_info_t const*, std::vector<MDSMap::mds_info_t const*>, decltype(leader_order)>;

  for (const auto& [fscid, fs] : std::as_const(fsmap)) {
    auto &&mdsmap = fs.get_mds_map();

    // A filesystem whose map epoch is behind the fsmap epoch has no changes,
    // so the calculation below is only done for the others.
    // NB! be careful with this condition when updating the leader selection
    //     logic: once input from outside of this fsmap affects the decision,
    //     the condition has to be updated, too.
    const bool fs_changed = !(mdsmap.get_epoch() < fsmap.get_epoch());
    if (fs_changed) {
      candidate_heap_t candidates(leader_order);
      std::unordered_set<mds_gid_t> members;

      for (auto&& [gid, info] : mdsmap.get_mds_info()) {
        // only daemons that hold a rank and are in STATE_ACTIVE are eligible
        if (info.rank == MDS_RANK_NONE || info.state != MDSMap::STATE_ACTIVE) {
          continue;
        }
        candidates.push(&info);
        members.insert(info.global_id);
      }

      mds_gid_t leader = MDS_GID_NONE;
      if (!candidates.empty()) {
        leader = candidates.top()->global_id;
      }

      // The callback's bool result is what update_quiesce_db_cluster returns:
      // whether the stored cluster was actually replaced.
      fsmap.modify_filesystem(fscid, [&](auto &writable_fs) -> bool {
        auto &writable_map = writable_fs.get_mds_map();
        return writable_map.update_quiesce_db_cluster(leader, std::move(members));
      });
    }
  }
}
226+
177227
void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
178228
{
179229
auto &pending = get_pending_fsmap_writeable();
180230
auto epoch = pending.get_epoch();
181231

232+
assign_quiesce_db_leader(pending);
233+
182234
dout(10) << "encode_pending e" << epoch << dendl;
183235

184236
// print map iff 'debug mon = 30' or higher

src/mon/MDSMonitor.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,8 @@ class MDSMonitor : public PaxosService, public PaxosFSMap, protected CommandHand
129129
int load_metadata(std::map<mds_gid_t, Metadata>& m);
130130
void count_metadata(const std::string& field, ceph::Formatter *f);
131131

132+
void assign_quiesce_db_leader(FSMap &fsmap);
133+
132134
public:
133135
void print_fs_summary(std::ostream& out) {
134136
get_fsmap().print_fs_summary(out);

0 commit comments

Comments
 (0)