Skip to content

Commit 62eb727

Browse files
authored
Merge pull request ceph#56193 from joscollin/wip-B64927-test_cephfs_mirror_blocklist-fail
cephfs_mirror, qa: fix mirror daemon doesn't restart when blocklisted or failed

Reviewed-by: Venky Shankar <[email protected]>
2 parents e60cb05 + a9a5691 commit 62eb727

File tree

9 files changed

+70
-62
lines changed

9 files changed

+70
-62
lines changed

qa/tasks/cephfs/test_mirroring.py

Lines changed: 13 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -275,7 +275,8 @@ def get_mirror_rados_addr(self, fs_name, fs_id):
275275
"""return the rados addr used by cephfs-mirror instance"""
276276
res = self.mirror_daemon_command(f'mirror status for fs: {fs_name}',
277277
'fs', 'mirror', 'status', f'{fs_name}@{fs_id}')
278-
return res['rados_inst']
278+
if 'rados_inst' in res:
279+
return res['rados_inst']
279280

280281
def mirror_daemon_command(self, cmd_label, *args):
281282
asok_path = self.get_daemon_admin_socket()
@@ -491,6 +492,7 @@ def test_cephfs_mirror_blocklist(self):
491492

492493
# fetch rados address for blacklist check
493494
rados_inst = self.get_mirror_rados_addr(self.primary_fs_name, self.primary_fs_id)
495+
self.assertTrue(rados_inst)
494496

495497
# simulate non-responding mirror daemon by sending SIGSTOP
496498
pid = self.get_mirror_daemon_pid()
@@ -509,9 +511,16 @@ def test_cephfs_mirror_blocklist(self):
509511
# check if the rados addr is blocklisted
510512
self.assertTrue(self.mds_cluster.is_addr_blocklisted(rados_inst))
511513

512-
# wait enough so that the mirror daemon restarts blocklisted instances
513-
time.sleep(40)
514-
rados_inst_new = self.get_mirror_rados_addr(self.primary_fs_name, self.primary_fs_id)
514+
# wait for restart, which is after 30 seconds timeout (cephfs_mirror_restart_mirror_on_blocklist_interval)
515+
time.sleep(60)
516+
517+
# get the new rados_inst
518+
rados_inst_new = ""
519+
with safe_while(sleep=2, tries=20, action='wait for mirror status rados_inst') as proceed:
520+
while proceed():
521+
rados_inst_new = self.get_mirror_rados_addr(self.primary_fs_name, self.primary_fs_id)
522+
if rados_inst_new:
523+
break
515524

516525
# and we should get a new rados instance
517526
self.assertTrue(rados_inst != rados_inst_new)

src/tools/cephfs_mirror/FSMirror.cc

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -114,6 +114,7 @@ FSMirror::FSMirror(CephContext *cct, const Filesystem &filesystem, uint64_t pool
114114
m_args(args),
115115
m_work_queue(work_queue),
116116
m_snap_listener(this),
117+
m_ts_listener(this),
117118
m_asok_hook(new MirrorAdminSocketHook(cct, filesystem, this)) {
118119
m_service_daemon->add_or_update_fs_attribute(m_filesystem.fscid, SERVICE_DAEMON_DIR_COUNT_KEY,
119120
(uint64_t)0);
@@ -270,7 +271,7 @@ void FSMirror::init_instance_watcher(Context *on_finish) {
270271

271272
Context *ctx = new C_CallbackAdapter<
272273
FSMirror, &FSMirror::handle_init_instance_watcher>(this);
273-
m_instance_watcher = InstanceWatcher::create(m_ioctx, m_snap_listener, m_work_queue);
274+
m_instance_watcher = InstanceWatcher::create(m_ioctx, m_snap_listener, m_ts_listener, m_work_queue);
274275
m_instance_watcher->init(ctx);
275276
}
276277

@@ -299,7 +300,7 @@ void FSMirror::init_mirror_watcher() {
299300
std::scoped_lock locker(m_lock);
300301
Context *ctx = new C_CallbackAdapter<
301302
FSMirror, &FSMirror::handle_init_mirror_watcher>(this);
302-
m_mirror_watcher = MirrorWatcher::create(m_ioctx, this, m_work_queue);
303+
m_mirror_watcher = MirrorWatcher::create(m_ioctx, this, m_ts_listener, m_work_queue);
303304
m_mirror_watcher->init(ctx);
304305
}
305306

src/tools/cephfs_mirror/FSMirror.h

Lines changed: 27 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -59,14 +59,12 @@ class FSMirror {
5959

6060
monotime get_failed_ts() {
6161
std::scoped_lock locker(m_lock);
62-
if (m_instance_watcher) {
63-
return m_instance_watcher->get_failed_ts();
64-
}
65-
if (m_mirror_watcher) {
66-
return m_mirror_watcher->get_failed_ts();
67-
}
62+
return m_failed_ts;
63+
}
6864

69-
return clock::now();
65+
void set_failed_ts() {
66+
std::scoped_lock locker(m_lock);
67+
m_failed_ts = clock::now();
7068
}
7169

7270
bool is_blocklisted() {
@@ -76,14 +74,12 @@ class FSMirror {
7674

7775
monotime get_blocklisted_ts() {
7876
std::scoped_lock locker(m_lock);
79-
if (m_instance_watcher) {
80-
return m_instance_watcher->get_blocklisted_ts();
81-
}
82-
if (m_mirror_watcher) {
83-
return m_mirror_watcher->get_blocklisted_ts();
84-
}
77+
return m_blocklisted_ts;
78+
}
8579

86-
return clock::now();
80+
void set_blocklisted_ts() {
81+
std::scoped_lock locker(m_lock);
82+
m_blocklisted_ts = clock::now();
8783
}
8884

8985
Peers get_peers() {
@@ -128,8 +124,24 @@ class FSMirror {
128124
void release_directory(std::string_view dir_path) override {
129125
fs_mirror->handle_release_directory(dir_path);
130126
}
127+
128+
};
129+
130+
struct TimestampListener: public Watcher::ErrorListener {
131+
FSMirror *fs_mirror;
132+
TimestampListener(FSMirror *fs_mirror)
133+
: fs_mirror(fs_mirror) {
134+
}
135+
void set_blocklisted_ts() {
136+
fs_mirror->set_blocklisted_ts();
137+
}
138+
void set_failed_ts() {
139+
fs_mirror->set_failed_ts();
140+
}
131141
};
132142

143+
monotime m_blocklisted_ts;
144+
monotime m_failed_ts;
133145
CephContext *m_cct;
134146
Filesystem m_filesystem;
135147
uint64_t m_pool_id;
@@ -139,6 +151,7 @@ class FSMirror {
139151

140152
ceph::mutex m_lock = ceph::make_mutex("cephfs::mirror::fs_mirror");
141153
SnapListener m_snap_listener;
154+
TimestampListener m_ts_listener;
142155
std::set<std::string, std::less<>> m_directories;
143156
Peers m_all_peers;
144157
std::map<Peer, std::unique_ptr<PeerReplayer>> m_peer_replayers;

src/tools/cephfs_mirror/InstanceWatcher.cc

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -31,10 +31,11 @@ std::string instance_oid(const std::string &instance_id) {
3131
} // anonymous namespace
3232

3333
InstanceWatcher::InstanceWatcher(librados::IoCtx &ioctx,
34-
Listener &listener, ContextWQ *work_queue)
34+
Listener &listener, ErrorListener &elistener, ContextWQ *work_queue)
3535
: Watcher(ioctx, instance_oid(stringify(ioctx.get_instance_id())), work_queue),
3636
m_ioctx(ioctx),
3737
m_listener(listener),
38+
m_elistener(elistener),
3839
m_work_queue(work_queue),
3940
m_lock(ceph::make_mutex("cephfs::mirror::instance_watcher")) {
4041
}
@@ -116,15 +117,15 @@ void InstanceWatcher::handle_rewatch_complete(int r) {
116117
dout(0) << ": client blocklisted" <<dendl;
117118
std::scoped_lock locker(m_lock);
118119
m_blocklisted = true;
119-
m_blocklisted_ts = clock::now();
120+
m_elistener.set_blocklisted_ts();
120121
} else if (r == -ENOENT) {
121122
derr << ": mirroring object deleted" << dendl;
122123
m_failed = true;
123-
m_failed_ts = clock::now();
124+
m_elistener.set_failed_ts();
124125
} else if (r < 0) {
125126
derr << ": rewatch error: " << cpp_strerror(r) << dendl;
126127
m_failed = true;
127-
m_failed_ts = clock::now();
128+
m_elistener.set_failed_ts();
128129
}
129130
}
130131

src/tools/cephfs_mirror/InstanceWatcher.h

Lines changed: 4 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -31,11 +31,11 @@ class InstanceWatcher : public Watcher {
3131
};
3232

3333
static InstanceWatcher *create(librados::IoCtx &ioctx,
34-
Listener &listener, ContextWQ *work_queue) {
35-
return new InstanceWatcher(ioctx, listener, work_queue);
34+
Listener &listener, ErrorListener &elistener, ContextWQ *work_queue) {
35+
return new InstanceWatcher(ioctx, listener, elistener, work_queue);
3636
}
3737

38-
InstanceWatcher(librados::IoCtx &ioctx, Listener &listener, ContextWQ *work_queue);
38+
InstanceWatcher(librados::IoCtx &ioctx, Listener &listener, ErrorListener &elistener, ContextWQ *work_queue);
3939
~InstanceWatcher();
4040

4141
void init(Context *on_finish);
@@ -50,24 +50,15 @@ class InstanceWatcher : public Watcher {
5050
return m_blocklisted;
5151
}
5252

53-
monotime get_blocklisted_ts() {
54-
std::scoped_lock locker(m_lock);
55-
return m_blocklisted_ts;
56-
}
57-
5853
bool is_failed() {
5954
std::scoped_lock locker(m_lock);
6055
return m_failed;
6156
}
6257

63-
monotime get_failed_ts() {
64-
std::scoped_lock locker(m_lock);
65-
return m_failed_ts;
66-
}
67-
6858
private:
6959
librados::IoCtx &m_ioctx;
7060
Listener &m_listener;
61+
ErrorListener &m_elistener;
7162
ContextWQ *m_work_queue;
7263

7364
ceph::mutex m_lock;
@@ -77,9 +68,6 @@ class InstanceWatcher : public Watcher {
7768
bool m_blocklisted = false;
7869
bool m_failed = false;
7970

80-
monotime m_blocklisted_ts;
81-
monotime m_failed_ts;
82-
8371
void create_instance();
8472
void handle_create_instance(int r);
8573

src/tools/cephfs_mirror/Mirror.cc

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -558,9 +558,9 @@ void Mirror::update_fs_mirrors() {
558558
std::scoped_lock locker(m_lock);
559559
for (auto &[filesystem, mirror_action] : m_mirror_actions) {
560560
auto failed_restart = mirror_action.fs_mirror && mirror_action.fs_mirror->is_failed() &&
561-
(failed_interval.count() > 0 && duration_cast<seconds>(mirror_action.fs_mirror->get_failed_ts() - clock::now()) > failed_interval);
561+
(failed_interval.count() > 0 && duration_cast<seconds>(clock::now() - mirror_action.fs_mirror->get_failed_ts()).count() > failed_interval.count());
562562
auto blocklisted_restart = mirror_action.fs_mirror && mirror_action.fs_mirror->is_blocklisted() &&
563-
(blocklist_interval.count() > 0 && duration_cast<seconds>(mirror_action.fs_mirror->get_blocklisted_ts() - clock::now()) > blocklist_interval);
563+
(blocklist_interval.count() > 0 && duration_cast<seconds>(clock::now() - mirror_action.fs_mirror->get_blocklisted_ts()).count() > blocklist_interval.count());
564564

565565
if (!mirror_action.action_in_progress && !_is_restarting(filesystem)) {
566566
if (failed_restart || blocklisted_restart) {

src/tools/cephfs_mirror/MirrorWatcher.cc

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -21,10 +21,11 @@ namespace cephfs {
2121
namespace mirror {
2222

2323
MirrorWatcher::MirrorWatcher(librados::IoCtx &ioctx, FSMirror *fs_mirror,
24-
ContextWQ *work_queue)
24+
ErrorListener &elistener, ContextWQ *work_queue)
2525
: Watcher(ioctx, CEPHFS_MIRROR_OBJECT, work_queue),
2626
m_ioctx(ioctx),
2727
m_fs_mirror(fs_mirror),
28+
m_elistener(elistener),
2829
m_work_queue(work_queue),
2930
m_lock(ceph::make_mutex("cephfs::mirror::mirror_watcher")),
3031
m_instance_id(stringify(m_ioctx.get_instance_id())) {
@@ -92,15 +93,15 @@ void MirrorWatcher::handle_rewatch_complete(int r) {
9293
dout(0) << ": client blocklisted" <<dendl;
9394
std::scoped_lock locker(m_lock);
9495
m_blocklisted = true;
95-
m_blocklisted_ts = clock::now();
96+
m_elistener.set_blocklisted_ts();
9697
} else if (r == -ENOENT) {
9798
derr << ": mirroring object deleted" << dendl;
9899
m_failed = true;
99-
m_failed_ts = clock::now();
100+
m_elistener.set_failed_ts();
100101
} else if (r < 0) {
101102
derr << ": rewatch error: " << cpp_strerror(r) << dendl;
102103
m_failed = true;
103-
m_failed_ts = clock::now();
104+
m_elistener.set_failed_ts();
104105
}
105106
}
106107

src/tools/cephfs_mirror/MirrorWatcher.h

Lines changed: 4 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -28,11 +28,11 @@ class FSMirror;
2828
class MirrorWatcher : public Watcher {
2929
public:
3030
static MirrorWatcher *create(librados::IoCtx &ioctx, FSMirror *fs_mirror,
31-
ContextWQ *work_queue) {
32-
return new MirrorWatcher(ioctx, fs_mirror, work_queue);
31+
ErrorListener &elistener, ContextWQ *work_queue) {
32+
return new MirrorWatcher(ioctx, fs_mirror, elistener, work_queue);
3333
}
3434

35-
MirrorWatcher(librados::IoCtx &ioctx, FSMirror *fs_mirror,
35+
MirrorWatcher(librados::IoCtx &ioctx, FSMirror *fs_mirror, ErrorListener &elistener,
3636
ContextWQ *work_queue);
3737
~MirrorWatcher();
3838

@@ -48,24 +48,15 @@ class MirrorWatcher : public Watcher {
4848
return m_blocklisted;
4949
}
5050

51-
monotime get_blocklisted_ts() {
52-
std::scoped_lock locker(m_lock);
53-
return m_blocklisted_ts;
54-
}
55-
5651
bool is_failed() {
5752
std::scoped_lock locker(m_lock);
5853
return m_failed;
5954
}
6055

61-
monotime get_failed_ts() {
62-
std::scoped_lock locker(m_lock);
63-
return m_failed_ts;
64-
}
65-
6656
private:
6757
librados::IoCtx &m_ioctx;
6858
FSMirror *m_fs_mirror;
59+
ErrorListener &m_elistener;
6960
ContextWQ *m_work_queue;
7061

7162
ceph::mutex m_lock;
@@ -77,9 +68,6 @@ class MirrorWatcher : public Watcher {
7768
bool m_blocklisted = false;
7869
bool m_failed = false;
7970

80-
monotime m_blocklisted_ts;
81-
monotime m_failed_ts;
82-
8371
void register_watcher();
8472
void handle_register_watcher(int r);
8573

src/tools/cephfs_mirror/Watcher.h

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,13 @@ class Watcher {
2828
void register_watch(Context *on_finish);
2929
void unregister_watch(Context *on_finish);
3030

31+
// Callback interface a watcher uses to tell its owner *when* an error was
// observed. The watchers in this change invoke it from their rewatch
// completion handlers: set_blocklisted_ts() on the "client blocklisted"
// path, set_failed_ts() when the mirroring object is gone (-ENOENT) or the
// rewatch fails with any other negative error code.
struct ErrorListener {
  // Virtual so implementations can be destroyed through this interface.
  virtual ~ErrorListener() = default;

  // Record the time at which the client was found to be blocklisted.
  virtual void set_blocklisted_ts() = 0;

  // Record the time at which the watcher entered the failed state.
  virtual void set_failed_ts() = 0;
};
37+
3138
protected:
3239
std::string m_oid;
3340

0 commit comments

Comments
 (0)