
Commit 6b1d0de

Merge PR ceph#56816 into main
* refs/pull/56816/head:
  doc: mention the peer status failed when snapshot created on the remote filesystem
  qa: add test_cephfs_mirror_remote_snap_corrupt_fails_synced_snapshot
  cephfs_mirror: update peer status for invalid metadata in remote snapshot

Reviewed-by: Venky Shankar <[email protected]>
Reviewed-by: Anthony D Atri <[email protected]>
2 parents 76226b3 + ce10e5e, commit 6b1d0de

4 files changed: +122, -2 lines changed

doc/cephfs/cephfs-mirroring.rst

Lines changed: 38 additions & 0 deletions
@@ -423,6 +423,44 @@ This allows a user to add a non-existent directory for synchronization. The mirr
 will mark such a directory as failed and retry (less frequently). When the directory is
 created, the mirror daemon will clear the failed state upon successful synchronization.

+Adding a new snapshot or a new directory manually in the .snap directory of the
+remote filesystem will result in a failed status for the corresponding configured
+directory. In the remote filesystem::
+
+  $ ceph fs subvolume snapshot create cephfs subvol1 snap2 group1
+
+or::
+
+  $ mkdir /d0/.snap/snap2
+
+Then query the mirror daemon admin socket for the peer status::
+
+  $ ceph --admin-daemon /var/run/ceph/cephfs-mirror.asok fs mirror peer status cephfs@360 a2dc7784-e7a1-4723-b103-03ee8d8768f8
+  {
+      "/d0": {
+          "state": "failed",
+          "failure_reason": "snapshot 'snap2' has invalid metadata",
+          "last_synced_snap": {
+              "id": 120,
+              "name": "snap1",
+              "sync_duration": 0.079997898999999997,
+              "sync_time_stamp": "274900.558797s"
+          },
+          "snaps_synced": 2,
+          "snaps_deleted": 0,
+          "snaps_renamed": 0
+      },
+      "/f0": {
+          "state": "failed",
+          "snaps_synced": 0,
+          "snaps_deleted": 0,
+          "snaps_renamed": 0
+      }
+  }
+
+When the snapshot or the directory is removed from the remote filesystem, the mirror
+daemon will clear the failed state upon successful synchronization of the pending
+snapshots, if any.
+
+.. note:: Treat the remote filesystem as read-only. This is not inherently enforced by
+          CephFS, but with suitably restrictive MDS caps, users will not be able to
+          snapshot directories in the remote file system.
+
 When mirroring is disabled, the respective `fs mirror status` command for the file system
 will not show up in command help.
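For scripting against this status interface, the following is a minimal polling sketch (not part of the change itself; the admin-socket path, filesystem id, and peer UUID are the example values from the output above and would differ on a real deployment):

    import json
    import subprocess
    import time

    # Example values taken from the documentation snippet above.
    ASOK = "/var/run/ceph/cephfs-mirror.asok"
    FS_SPEC = "cephfs@360"
    PEER_UUID = "a2dc7784-e7a1-4723-b103-03ee8d8768f8"

    def peer_status():
        # Same command as shown in the doc; the output is a JSON object
        # keyed by the configured directory paths ("/d0", "/f0", ...).
        out = subprocess.check_output(
            ["ceph", "--admin-daemon", ASOK,
             "fs", "mirror", "peer", "status", FS_SPEC, PEER_UUID])
        return json.loads(out)

    # Poll until /d0 settles; a failed directory carries the new
    # "failure_reason" key, while an idle one does not.
    for _ in range(60):
        d0 = peer_status().get("/d0", {})
        if d0.get("state") == "failed":
            print("sync failed:", d0.get("failure_reason", "<no reason recorded>"))
            break
        if d0.get("state") == "idle":
            print("in sync, last synced snapshot:", d0["last_synced_snap"]["name"])
            break
        time.sleep(1)

Note that `failure_reason` is only emitted while the directory is in the failed state, and only when a reason was recorded, which is why the sketch treats the key as optional.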

qa/tasks/cephfs/test_mirroring.py

Lines changed: 76 additions & 0 deletions
@@ -204,6 +204,17 @@ def check_peer_status(self, fs_name, fs_id, peer_spec, dir_name, expected_snap_n
         self.assertTrue(res[dir_name]['last_synced_snap']['name'] == expected_snap_name)
         self.assertTrue(res[dir_name]['snaps_synced'] == expected_snap_count)

+    def check_peer_status_idle(self, fs_name, fs_id, peer_spec, dir_name, expected_snap_name,
+                               expected_snap_count):
+        peer_uuid = self.get_peer_uuid(peer_spec)
+        res = self.mirror_daemon_command(f'peer status for fs: {fs_name}',
+                                         'fs', 'mirror', 'peer', 'status',
+                                         f'{fs_name}@{fs_id}', peer_uuid)
+        self.assertTrue(dir_name in res)
+        self.assertTrue('idle' == res[dir_name]['state'])
+        self.assertTrue(expected_snap_name == res[dir_name]['last_synced_snap']['name'])
+        self.assertTrue(expected_snap_count == res[dir_name]['snaps_synced'])
+
     def check_peer_status_deleted_snap(self, fs_name, fs_id, peer_spec, dir_name,
                                        expected_delete_count):
         peer_uuid = self.get_peer_uuid(peer_spec)
@@ -1499,3 +1510,68 @@ def test_get_set_mirror_dirty_snap_id(self):
         self.mount_b.setfattr("d1/d2/d3", "ceph.mirror.dirty_snap_id", attr)
         val = self.mount_b.getfattr("d1/d2/d3", "ceph.mirror.dirty_snap_id")
         self.assertEqual(attr, val, f"Mismatch for ceph.mirror.dirty_snap_id value: {attr} vs {val}")
+
+    def test_cephfs_mirror_remote_snap_corrupt_fails_synced_snapshot(self):
+        """
+        That making manual changes to the remote .snap directory moves the
+        'peer status' state to "failed" for a synced snapshot, and restores
+        it to "idle" once those changes are reverted.
+        """
+        log.debug('reconfigure client auth caps')
+        self.get_ceph_cmd_result(
+            'auth', 'caps', "client.{0}".format(self.mount_b.client_id),
+            'mds', 'allow rwps',
+            'mon', 'allow r',
+            'osd', 'allow rw pool={0}'.format(
+                self.backup_fs.get_data_pool_name()))
+        log.debug(f'mounting filesystem {self.secondary_fs_name}')
+        self.mount_b.umount_wait()
+        self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name)
+
+        self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+        peer_spec = "client.mirror_remote@ceph"
+        self.peer_add(self.primary_fs_name, self.primary_fs_id, peer_spec, self.secondary_fs_name)
+        dir_name = 'd0'
+        self.mount_a.run_shell(['mkdir', dir_name])
+        self.add_directory(self.primary_fs_name, self.primary_fs_id, f'/{dir_name}')
+
+        # take a snapshot on the primary
+        snap_name = "snap_a"
+        expected_snap_count = 1
+        self.mount_a.run_shell(['mkdir', f'{dir_name}/.snap/{snap_name}'])
+
+        time.sleep(30)
+        # confirm the snapshot synced and the peer status is 'idle'
+        self.check_peer_status_idle(self.primary_fs_name, self.primary_fs_id,
+                                    peer_spec, f'/{dir_name}', snap_name, expected_snap_count)
+
+        remote_snap_name = 'snap_b'
+        remote_snap_path = f'{dir_name}/.snap/{remote_snap_name}'
+        failure_reason = f"snapshot '{remote_snap_name}' has invalid metadata"
+        dir_name = f'/{dir_name}'
+
+        # create a snapshot directly in the remote fs and check for status 'failed'
+        self.mount_b.run_shell(['sudo', 'mkdir', remote_snap_path], omit_sudo=False)
+        peer_uuid = self.get_peer_uuid(peer_spec)
+        with safe_while(sleep=1, tries=60, action=f'wait for failed status: {peer_spec}') as proceed:
+            while proceed():
+                res = self.mirror_daemon_command(f'peer status for fs: {self.primary_fs_name}',
+                                                 'fs', 'mirror', 'peer', 'status',
+                                                 f'{self.primary_fs_name}@{self.primary_fs_id}',
+                                                 peer_uuid)
+                if ('failed' == res[dir_name]['state'] and
+                        failure_reason == res[dir_name].get('failure_reason') and
+                        snap_name == res[dir_name]['last_synced_snap']['name'] and
+                        expected_snap_count == res[dir_name]['snaps_synced']):
+                    break
+
+        # remove the snapshot from the remote fs and check that the status
+        # restores to 'idle'; failure_reason is nested under the directory entry
+        self.mount_b.run_shell(['sudo', 'rmdir', remote_snap_path], omit_sudo=False)
+        with safe_while(sleep=1, tries=60, action=f'wait for idle status: {peer_spec}') as proceed:
+            while proceed():
+                res = self.mirror_daemon_command(f'peer status for fs: {self.primary_fs_name}',
+                                                 'fs', 'mirror', 'peer', 'status',
+                                                 f'{self.primary_fs_name}@{self.primary_fs_id}',
+                                                 peer_uuid)
+                if ('idle' == res[dir_name]['state'] and
+                        'failure_reason' not in res[dir_name] and
+                        snap_name == res[dir_name]['last_synced_snap']['name'] and
+                        expected_snap_count == res[dir_name]['snaps_synced']):
+                    break
+
+        self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
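A note on the auth caps reconfigured at the start of this test: the remote client is granted `allow rwps` on the MDS precisely because the test must create the corrupting snapshot itself; the `s` flag is what permits snapshot creation and deletion. Per the documentation note added in this change, ordinary clients of the remote (backup) filesystem should be authorized without that flag. A hypothetical sketch, with placeholder filesystem and client names:

    import subprocess

    # Hypothetical hardening sketch: authorize a regular consumer of the
    # remote filesystem with plain 'rw' MDS caps. Without the extra 's'
    # flag (i.e. 'rws'), the client cannot create or delete snapshots,
    # keeping the remote .snap directories effectively read-only for it.
    subprocess.check_call([
        "ceph", "fs", "authorize", "backup_fs",   # remote filesystem (placeholder)
        "client.consumer", "/", "rw",             # note: 'rw', not 'rws'
    ])

With caps like these, the manual `mkdir` under `.snap` that this test performs on the remote filesystem should be denied, so such a client cannot push a configured directory into the failed state.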

src/tools/cephfs_mirror/PeerReplayer.cc

Lines changed: 6 additions & 2 deletions
@@ -528,8 +528,9 @@ int PeerReplayer::build_snap_map(const std::string &dir_root,
     uint64_t snap_id;
     if (is_remote) {
       if (!info.nr_snap_metadata) {
-        derr << ": snap_path=" << snap_path << " has invalid metadata in remote snapshot"
-             << dendl;
+        std::string failed_reason = "snapshot '" + snap + "' has invalid metadata";
+        derr << ": " << failed_reason << dendl;
+        m_snap_sync_stats.at(dir_root).last_failed_reason = failed_reason;
         rv = -EINVAL;
       } else {
         auto metadata = decode_snap_metadata(info.snap_metadata, info.nr_snap_metadata);
@@ -1807,6 +1808,9 @@ void PeerReplayer::peer_status(Formatter *f) {
     f->open_object_section(dir_root);
     if (sync_stat.failed) {
       f->dump_string("state", "failed");
+      if (sync_stat.last_failed_reason) {
+        f->dump_string("failure_reason", *sync_stat.last_failed_reason);
+      }
     } else if (!sync_stat.current_syncing_snap) {
       f->dump_string("state", "idle");
     } else {

src/tools/cephfs_mirror/PeerReplayer.h

Lines changed: 2 additions & 0 deletions
@@ -141,6 +141,7 @@ class PeerReplayer {
   struct SnapSyncStat {
     uint64_t nr_failures = 0; // number of consecutive failures
     boost::optional<monotime> last_failed; // last failed timestamp
+    boost::optional<std::string> last_failed_reason; // why the last sync failed
     bool failed = false; // hit upper cap for consecutive failures
     boost::optional<std::pair<uint64_t, std::string>> last_synced_snap;
     boost::optional<std::pair<uint64_t, std::string>> current_syncing_snap;
@@ -177,6 +178,7 @@ class PeerReplayer {
     sync_stat.nr_failures = 0;
     sync_stat.failed = false;
     sync_stat.last_failed = boost::none;
+    sync_stat.last_failed_reason = boost::none;
   }

   void _set_last_synced_snap(const std::string &dir_root, uint64_t snap_id,
