Skip to content

Commit 8b896a9

Browse files
committed
mds/quiesce-db: keep the db thread alive until shutdown
With the change we can now avoid having to join it during the membership update, preventing potential deadlocks Signed-off-by: Leonid Usov <[email protected]>
1 parent 3e012f7 commit 8b896a9

File tree

4 files changed

+73
-73
lines changed

4 files changed

+73
-73
lines changed

src/mds/MDSRank.cc

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -864,17 +864,15 @@ void MDSRankDispatcher::shutdown()
864864

865865
progress_thread.shutdown();
866866

867-
if (quiesce_db_manager) {
868-
// shutdown the manager
869-
quiesce_db_manager->update_membership({});
870-
}
871-
872867
// release mds_lock for finisher/messenger threads (e.g.
873868
// MDSDaemon::ms_handle_reset called from Messenger).
874869
mds_lock.unlock();
875870

876871
// shut down messenger
877872
messenger->shutdown();
873+
874+
// the quiesce db membership is
875+
// managed by the mds map update, no need to address that here
878876
if (quiesce_agent) {
879877
// reset any tracked roots
880878
quiesce_agent->shutdown();

src/mds/QuiesceDbManager.cc

Lines changed: 46 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
#define dout_context g_ceph_context
2323
#define dout_subsys ceph_subsys_mds_quiesce
2424
#undef dout_prefix
25-
#define dout_prefix *_dout << "quiesce.mgr <" << __func__ << "> "
25+
#define dout_prefix *_dout << "quiesce.mgr." << membership.me << " <" << __func__ << "> "
2626

2727
#undef dout
2828
#define dout(lvl) \
@@ -54,35 +54,34 @@ static QuiesceTimeInterval time_distance(QuiesceTimePoint lhs, QuiesceTimePoint
5454

5555
bool QuiesceDbManager::db_thread_has_work() const
5656
{
57-
return false
57+
return db_thread_should_exit
5858
|| pending_acks.size() > 0
5959
|| pending_requests.size() > 0
6060
|| pending_db_updates.size() > 0
6161
|| (agent_callback.has_value() && agent_callback->if_newer < db_version())
62-
|| (!cluster_membership.has_value() || cluster_membership->epoch != membership.epoch);
62+
|| (cluster_membership.has_value() && cluster_membership->epoch != membership.epoch);
6363
}
6464

6565
void* QuiesceDbManager::quiesce_db_thread_main()
6666
{
67-
db_thread_enter();
68-
6967
std::unique_lock ls(submit_mutex);
7068
QuiesceTimeInterval next_event_at_age = QuiesceTimeInterval::max();
7169
QuiesceDbVersion last_acked = {0, 0};
7270

73-
while (true) {
71+
dout(5) << "Entering the main thread" << dendl;
72+
bool keep_working = true;
73+
while (keep_working) {
7474

7575
auto db_age = db.get_age();
7676

7777
if (!db_thread_has_work() && next_event_at_age > db_age) {
7878
submit_condition.wait_for(ls, next_event_at_age - db_age);
7979
}
8080

81-
if (!membership_upkeep()) {
82-
break;
83-
}
81+
auto [is_member, should_exit] = membership_upkeep();
82+
keep_working = !should_exit;
8483

85-
{
84+
if (is_member) {
8685
decltype(pending_acks) acks(std::move(pending_acks));
8786
decltype(pending_requests) requests(std::move(pending_requests));
8887
decltype(pending_db_updates) db_updates(std::move(pending_db_updates));
@@ -105,6 +104,10 @@ void* QuiesceDbManager::quiesce_db_thread_main()
105104
} else {
106105
next_event_at_age = replica_upkeep(std::move(db_updates));
107106
}
107+
} else {
108+
ls.unlock();
109+
dout(15) << "not a cluster member, keeping idle " << dendl;
110+
next_event_at_age = QuiesceTimeInterval::max();
108111
}
109112

110113
complete_requests();
@@ -131,7 +134,7 @@ void* QuiesceDbManager::quiesce_db_thread_main()
131134
}
132135
}
133136

134-
if (send_ack) {
137+
if (is_member && send_ack) {
135138
auto db_version = quiesce_map.db_version;
136139
dout(20) << "synchronous agent ack: " << quiesce_map << dendl;
137140
auto rc = membership.send_ack(std::move(quiesce_map));
@@ -148,7 +151,7 @@ void* QuiesceDbManager::quiesce_db_thread_main()
148151

149152
ls.unlock();
150153

151-
db_thread_exit();
154+
dout(5) << "Exiting the main thread" << dendl;
152155

153156
return 0;
154157
}
@@ -160,43 +163,36 @@ void QuiesceDbManager::update_membership(const QuiesceClusterMembership& new_mem
160163
bool will_participate = new_membership.members.contains(new_membership.me);
161164
dout(20) << "will participate: " << std::boolalpha << will_participate << std::noboolalpha << dendl;
162165

163-
if (cluster_membership && !will_participate) {
164-
// stop the thread
165-
cluster_membership.reset();
166+
if (will_participate && !quiesce_db_thread.is_started()) {
167+
// start the thread
168+
dout(5) << "starting the db mgr thread at epoch: " << new_membership.epoch << dendl;
169+
db_thread_should_exit = false;
170+
quiesce_db_thread.create("quiesce_db_mgr");
171+
} else {
166172
submit_condition.notify_all();
167-
lock.unlock();
168-
ceph_assert(quiesce_db_thread.is_started());
169-
dout(5) << "stopping the db mgr thread at epoch: " << new_membership.epoch << dendl;
170-
quiesce_db_thread.join();
171-
} else if (will_participate) {
172-
if (!cluster_membership) {
173-
// start the thread
174-
dout(5) << "starting the db mgr thread at epoch: " << new_membership.epoch << dendl;
175-
quiesce_db_thread.create("quiesce_db_mgr");
176-
} else {
177-
submit_condition.notify_all();
178-
}
179-
if (inject_request) {
180-
pending_requests.push_front(inject_request);
181-
}
173+
}
174+
175+
if (inject_request) {
176+
pending_requests.push_front(inject_request);
177+
}
178+
179+
if (will_participate) {
182180
cluster_membership = new_membership;
183-
184-
std::lock_guard lc(agent_mutex);
185-
if (agent_callback) {
186-
agent_callback->if_newer = {0, 0};
187-
}
181+
} else {
182+
cluster_membership.reset();
188183
}
189184

190-
if (!will_participate && inject_request) {
191-
inject_request->complete(-EPERM);
185+
std::lock_guard lc(agent_mutex);
186+
if (agent_callback) {
187+
agent_callback->if_newer = {0, 0};
192188
}
193189
}
194190

195-
bool QuiesceDbManager::membership_upkeep()
191+
std::pair<QuiesceDbManager::IsMemberBool, QuiesceDbManager::ShouldExitBool> QuiesceDbManager::membership_upkeep()
196192
{
197193
if (cluster_membership && cluster_membership->epoch == membership.epoch) {
198194
// no changes
199-
return true;
195+
return {true, db_thread_should_exit};
200196
}
201197

202198
bool was_leader = membership.epoch > 0 && membership.leader == membership.me;
@@ -206,7 +202,7 @@ bool QuiesceDbManager::membership_upkeep()
206202
<< std::boolalpha << was_leader << "->" << is_leader << std::noboolalpha
207203
<< " members:" << cluster_membership->members << dendl;
208204
} else {
209-
dout(10) << "shutdown! was_leader: " << was_leader << dendl;
205+
dout(10) << "not a member! was_leader: " << was_leader << dendl;
210206
}
211207

212208
if (is_leader) {
@@ -238,18 +234,24 @@ bool QuiesceDbManager::membership_upkeep()
238234
done_requests[await_ctx.req_ctx] = EINPROGRESS;
239235
}
240236
awaits.clear();
241-
// reject pending requests
237+
// reject pending requests as not leader
242238
while (!pending_requests.empty()) {
243-
done_requests[pending_requests.front()] = EPERM;
239+
done_requests[pending_requests.front()] = ENOTTY;
244240
pending_requests.pop_front();
245241
}
246242
}
247243

248244
if (cluster_membership) {
249245
membership = *cluster_membership;
246+
dout(15) << "Updated membership" << dendl;
247+
} else {
248+
membership.epoch = 0;
249+
peers.clear();
250+
awaits.clear();
251+
db.clear();
250252
}
251253

252-
return cluster_membership.has_value();
254+
return { cluster_membership.has_value(), db_thread_should_exit };
253255
}
254256

255257
QuiesceTimeInterval QuiesceDbManager::replica_upkeep(decltype(pending_db_updates)&& db_updates)
@@ -291,7 +293,7 @@ QuiesceTimeInterval QuiesceDbManager::replica_upkeep(decltype(pending_db_updates
291293
if (db.set_version > update.db_version.set_version) {
292294
dout(3) << "got an older version of DB from the leader: " << db.set_version << " > " << update.db_version.set_version << dendl;
293295
dout(3) << "discarding the DB" << dendl;
294-
db.reset();
296+
db.clear();
295297
} else {
296298
for (auto& [qs_id, qs] : update.sets) {
297299
db.sets.insert_or_assign(qs_id, std::move(qs));

src/mds/QuiesceDbManager.h

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,19 @@ class QuiesceDbManager {
4949
QuiesceDbManager() : quiesce_db_thread(this) {};
5050
virtual ~QuiesceDbManager()
5151
{
52+
shutdown();
53+
}
54+
55+
void shutdown() {
5256
update_membership({});
57+
58+
if (quiesce_db_thread.is_started()) {
59+
submit_mutex.lock();
60+
db_thread_should_exit = true;
61+
submit_condition.notify_all();
62+
submit_mutex.unlock();
63+
quiesce_db_thread.join();
64+
}
5365
}
5466

5567
// This will reset the manager state
@@ -191,6 +203,7 @@ class QuiesceDbManager {
191203
std::queue<QuiesceDbPeerListing> pending_db_updates;
192204
std::queue<QuiesceDbPeerAck> pending_acks;
193205
std::deque<RequestContext*> pending_requests;
206+
bool db_thread_should_exit = false;
194207

195208
class QuiesceDbThread : public Thread {
196209
public:
@@ -220,7 +233,7 @@ class QuiesceDbManager {
220233
QuiesceTimeInterval get_age() const {
221234
return QuiesceClock::now() - time_zero;
222235
}
223-
void reset() {
236+
void clear() {
224237
set_version = 0;
225238
sets.clear();
226239
time_zero = QuiesceClock::now();
@@ -257,23 +270,11 @@ class QuiesceDbManager {
257270
std::unordered_map<RequestContext*, int> done_requests;
258271

259272
void* quiesce_db_thread_main();
260-
261-
void db_thread_enter() {
262-
// this will invalidate the membership, see membership_upkeep()
263-
membership.epoch = 0;
264-
peers.clear();
265-
awaits.clear();
266-
done_requests.clear();
267-
db.reset();
268-
}
269-
270-
void db_thread_exit() {
271-
complete_requests();
272-
}
273-
274273
bool db_thread_has_work() const;
275274

276-
bool membership_upkeep();
275+
using IsMemberBool = bool;
276+
using ShouldExitBool = bool;
277+
std::pair<IsMemberBool, ShouldExitBool> membership_upkeep();
277278

278279
QuiesceTimeInterval replica_upkeep(decltype(pending_db_updates)&& db_updates);
279280
bool leader_bootstrap(decltype(pending_db_updates)&& db_updates, QuiesceTimeInterval &next_event_at_age);

src/test/mds/TestQuiesceDb.cc

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -230,18 +230,16 @@ class QuiesceDbTest: public testing::Test {
230230
response.clear();
231231
request.reset(c);
232232

233-
int rr = -1;
233+
int rr = -ENOTTY;
234234

235235
for (auto& [rank, mgr] : parent.managers) {
236236
if (!(rr = mgr->submit_request(this))) {
237237
break;
238238
}
239239
}
240240

241-
if (rr == EPERM) {
242-
// change the error to something never returned for a request
243-
// EPIPE seems reasonable as we couldn't find the leader to send the command to
244-
complete(EPIPE);
241+
if (rr) {
242+
complete(rr);
245243
return false;
246244
}
247245

@@ -358,6 +356,7 @@ TEST_F(QuiesceDbTest, ManagerStartup) {
358356
ASSERT_EQ(OK(), run_request_for(100, [](auto& r) {}));
359357
ASSERT_NO_FATAL_FAILURE(configure_cluster({ mds_gid_t(2) }));
360358
ASSERT_EQ(OK(), run_request_for(100, [](auto& r) {}));
359+
managers[mds_gid_t(2)]->shutdown();
361360
ASSERT_NO_FATAL_FAILURE(configure_cluster({ mds_gid_t(1), mds_gid_t(2) }));
362361
ASSERT_EQ(OK(), run_request_for(100, [](auto& r) {}));
363362
}
@@ -1328,9 +1327,9 @@ TEST_F(QuiesceDbTest, LeaderShutdown)
13281327

13291328
ASSERT_EQ(managers.at(mds_gid_t(1))->internal_pending_requests().size(), pending_requests.size());
13301329

1331-
// reset the membership of the manager
1330+
// shutdown the manager
13321331
// this will block until the db thread exits
1333-
managers.at(mds_gid_t(1))->update_membership({});
1332+
managers.at(mds_gid_t(1))->shutdown();
13341333

13351334
// as of now all requests must have finished
13361335
while(!outstanding_awaits.empty()) {
@@ -1341,7 +1340,7 @@ TEST_F(QuiesceDbTest, LeaderShutdown)
13411340

13421341
while (!pending_requests.empty()) {
13431342
auto& r = *pending_requests.front();
1344-
EXPECT_EQ(ERR(EPERM), r.check_result());
1343+
EXPECT_EQ(ERR(ENOTTY), r.check_result());
13451344
pending_requests.pop();
13461345
}
13471346
}

0 commit comments

Comments
 (0)