Skip to content

Commit a68e564

Browse files
lxbsz authored and idryomov committed
ceph: blocklist the kclient when receiving corrupted snap trace
When received corrupted snap trace we don't know what exactly has happened in MDS side. And we shouldn't continue IOs and metadatas access to MDS, which may corrupt or get incorrect contents. This patch will just block all the further IO/MDS requests immediately and then evict the kclient itself. The reason why we still need to evict the kclient just after blocking all the further IOs is that the MDS could revoke the caps faster. Link: https://tracker.ceph.com/issues/57686 Signed-off-by: Xiubo Li <[email protected]> Reviewed-by: Venky Shankar <[email protected]> Signed-off-by: Ilya Dryomov <[email protected]>
1 parent b38b17b commit a68e564

File tree

6 files changed

+93
-10
lines changed

6 files changed

+93
-10
lines changed

fs/ceph/addr.c

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -305,14 +305,19 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
305305
struct inode *inode = rreq->inode;
306306
struct ceph_inode_info *ci = ceph_inode(inode);
307307
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
308-
struct ceph_osd_request *req;
308+
struct ceph_osd_request *req = NULL;
309309
struct ceph_vino vino = ceph_vino(inode);
310310
struct iov_iter iter;
311311
struct page **pages;
312312
size_t page_off;
313313
int err = 0;
314314
u64 len = subreq->len;
315315

316+
if (ceph_inode_is_shutdown(inode)) {
317+
err = -EIO;
318+
goto out;
319+
}
320+
316321
if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq))
317322
return;
318323

@@ -563,6 +568,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
563568

564569
dout("writepage %p idx %lu\n", page, page->index);
565570

571+
if (ceph_inode_is_shutdown(inode))
572+
return -EIO;
573+
566574
/* verify this is a writeable snap context */
567575
snapc = page_snap_context(page);
568576
if (!snapc) {
@@ -1643,7 +1651,7 @@ int ceph_uninline_data(struct file *file)
16431651
struct ceph_inode_info *ci = ceph_inode(inode);
16441652
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
16451653
struct ceph_osd_request *req = NULL;
1646-
struct ceph_cap_flush *prealloc_cf;
1654+
struct ceph_cap_flush *prealloc_cf = NULL;
16471655
struct folio *folio = NULL;
16481656
u64 inline_version = CEPH_INLINE_NONE;
16491657
struct page *pages[1];
@@ -1657,6 +1665,11 @@ int ceph_uninline_data(struct file *file)
16571665
dout("uninline_data %p %llx.%llx inline_version %llu\n",
16581666
inode, ceph_vinop(inode), inline_version);
16591667

1668+
if (ceph_inode_is_shutdown(inode)) {
1669+
err = -EIO;
1670+
goto out;
1671+
}
1672+
16601673
if (inline_version == CEPH_INLINE_NONE)
16611674
return 0;
16621675

fs/ceph/caps.c

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4078,6 +4078,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
40784078
void *p, *end;
40794079
struct cap_extra_info extra_info = {};
40804080
bool queue_trunc;
4081+
bool close_sessions = false;
40814082

40824083
dout("handle_caps from mds%d\n", session->s_mds);
40834084

@@ -4215,9 +4216,13 @@ void ceph_handle_caps(struct ceph_mds_session *session,
42154216
realm = NULL;
42164217
if (snaptrace_len) {
42174218
down_write(&mdsc->snap_rwsem);
4218-
ceph_update_snap_trace(mdsc, snaptrace,
4219-
snaptrace + snaptrace_len,
4220-
false, &realm);
4219+
if (ceph_update_snap_trace(mdsc, snaptrace,
4220+
snaptrace + snaptrace_len,
4221+
false, &realm)) {
4222+
up_write(&mdsc->snap_rwsem);
4223+
close_sessions = true;
4224+
goto done;
4225+
}
42214226
downgrade_write(&mdsc->snap_rwsem);
42224227
} else {
42234228
down_read(&mdsc->snap_rwsem);
@@ -4277,6 +4282,11 @@ void ceph_handle_caps(struct ceph_mds_session *session,
42774282
iput(inode);
42784283
out:
42794284
ceph_put_string(extra_info.pool_ns);
4285+
4286+
/* Defer closing the sessions after s_mutex lock being released */
4287+
if (close_sessions)
4288+
ceph_mdsc_close_sessions(mdsc);
4289+
42804290
return;
42814291

42824292
flush_cap_releases:

fs/ceph/file.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2011,6 +2011,9 @@ static int ceph_zero_partial_object(struct inode *inode,
20112011
loff_t zero = 0;
20122012
int op;
20132013

2014+
if (ceph_inode_is_shutdown(inode))
2015+
return -EIO;
2016+
20142017
if (!length) {
20152018
op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE;
20162019
length = &zero;

fs/ceph/mds_client.c

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -806,6 +806,9 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
806806
{
807807
struct ceph_mds_session *s;
808808

809+
if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO)
810+
return ERR_PTR(-EIO);
811+
809812
if (mds >= mdsc->mdsmap->possible_max_rank)
810813
return ERR_PTR(-EINVAL);
811814

@@ -1478,6 +1481,9 @@ static int __open_session(struct ceph_mds_client *mdsc,
14781481
int mstate;
14791482
int mds = session->s_mds;
14801483

1484+
if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO)
1485+
return -EIO;
1486+
14811487
/* wait for mds to go active? */
14821488
mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
14831489
dout("open_session to mds%d (%s)\n", mds,
@@ -2860,6 +2866,11 @@ static void __do_request(struct ceph_mds_client *mdsc,
28602866
return;
28612867
}
28622868

2869+
if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) {
2870+
dout("do_request metadata corrupted\n");
2871+
err = -EIO;
2872+
goto finish;
2873+
}
28632874
if (req->r_timeout &&
28642875
time_after_eq(jiffies, req->r_started + req->r_timeout)) {
28652876
dout("do_request timed out\n");
@@ -3245,6 +3256,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
32453256
u64 tid;
32463257
int err, result;
32473258
int mds = session->s_mds;
3259+
bool close_sessions = false;
32483260

32493261
if (msg->front.iov_len < sizeof(*head)) {
32503262
pr_err("mdsc_handle_reply got corrupt (short) reply\n");
@@ -3351,10 +3363,17 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
33513363
realm = NULL;
33523364
if (rinfo->snapblob_len) {
33533365
down_write(&mdsc->snap_rwsem);
3354-
ceph_update_snap_trace(mdsc, rinfo->snapblob,
3366+
err = ceph_update_snap_trace(mdsc, rinfo->snapblob,
33553367
rinfo->snapblob + rinfo->snapblob_len,
33563368
le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP,
33573369
&realm);
3370+
if (err) {
3371+
up_write(&mdsc->snap_rwsem);
3372+
close_sessions = true;
3373+
if (err == -EIO)
3374+
ceph_msg_dump(msg);
3375+
goto out_err;
3376+
}
33583377
downgrade_write(&mdsc->snap_rwsem);
33593378
} else {
33603379
down_read(&mdsc->snap_rwsem);
@@ -3412,6 +3431,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
34123431
req->r_end_latency, err);
34133432
out:
34143433
ceph_mdsc_put_request(req);
3434+
3435+
/* Defer closing the sessions after s_mutex lock being released */
3436+
if (close_sessions)
3437+
ceph_mdsc_close_sessions(mdsc);
34153438
return;
34163439
}
34173440

@@ -5011,7 +5034,7 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
50115034
}
50125035

50135036
/*
5014-
* called after sb is ro.
5037+
* called after sb is ro or when metadata corrupted.
50155038
*/
50165039
void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
50175040
{
@@ -5301,7 +5324,8 @@ static void mds_peer_reset(struct ceph_connection *con)
53015324
struct ceph_mds_client *mdsc = s->s_mdsc;
53025325

53035326
pr_warn("mds%d closed our session\n", s->s_mds);
5304-
send_mds_reconnect(mdsc, s);
5327+
if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO)
5328+
send_mds_reconnect(mdsc, s);
53055329
}
53065330

53075331
static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg)

fs/ceph/snap.c

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// SPDX-License-Identifier: GPL-2.0
22
#include <linux/ceph/ceph_debug.h>
33

4+
#include <linux/fs.h>
45
#include <linux/sort.h>
56
#include <linux/slab.h>
67
#include <linux/iversion.h>
@@ -766,8 +767,10 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
766767
struct ceph_snap_realm *realm;
767768
struct ceph_snap_realm *first_realm = NULL;
768769
struct ceph_snap_realm *realm_to_rebuild = NULL;
770+
struct ceph_client *client = mdsc->fsc->client;
769771
int rebuild_snapcs;
770772
int err = -ENOMEM;
773+
int ret;
771774
LIST_HEAD(dirty_realms);
772775

773776
lockdep_assert_held_write(&mdsc->snap_rwsem);
@@ -884,6 +887,27 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
884887
if (first_realm)
885888
ceph_put_snap_realm(mdsc, first_realm);
886889
pr_err("%s error %d\n", __func__, err);
890+
891+
/*
892+
* When receiving a corrupted snap trace we don't know what
893+
* exactly has happened in MDS side. And we shouldn't continue
894+
* writing to OSD, which may corrupt the snapshot contents.
895+
*
896+
* Just try to blocklist this kclient and then this kclient
897+
* must be remounted to continue after the corrupted metadata
898+
* fixed in the MDS side.
899+
*/
900+
WRITE_ONCE(mdsc->fsc->mount_state, CEPH_MOUNT_FENCE_IO);
901+
ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr);
902+
if (ret)
903+
pr_err("%s failed to blocklist %s: %d\n", __func__,
904+
ceph_pr_addr(&client->msgr.inst.addr), ret);
905+
906+
WARN(1, "%s: %s%sdo remount to continue%s",
907+
__func__, ret ? "" : ceph_pr_addr(&client->msgr.inst.addr),
908+
ret ? "" : " was blocklisted, ",
909+
err == -EIO ? " after corrupted snaptrace is fixed" : "");
910+
887911
return err;
888912
}
889913

@@ -984,6 +1008,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
9841008
__le64 *split_inos = NULL, *split_realms = NULL;
9851009
int i;
9861010
int locked_rwsem = 0;
1011+
bool close_sessions = false;
9871012

9881013
/* decode */
9891014
if (msg->front.iov_len < sizeof(*h))
@@ -1092,8 +1117,12 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
10921117
* update using the provided snap trace. if we are deleting a
10931118
* snap, we can avoid queueing cap_snaps.
10941119
*/
1095-
ceph_update_snap_trace(mdsc, p, e,
1096-
op == CEPH_SNAP_OP_DESTROY, NULL);
1120+
if (ceph_update_snap_trace(mdsc, p, e,
1121+
op == CEPH_SNAP_OP_DESTROY,
1122+
NULL)) {
1123+
close_sessions = true;
1124+
goto bad;
1125+
}
10971126

10981127
if (op == CEPH_SNAP_OP_SPLIT)
10991128
/* we took a reference when we created the realm, above */
@@ -1112,6 +1141,9 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
11121141
out:
11131142
if (locked_rwsem)
11141143
up_write(&mdsc->snap_rwsem);
1144+
1145+
if (close_sessions)
1146+
ceph_mdsc_close_sessions(mdsc);
11151147
return;
11161148
}
11171149

fs/ceph/super.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ enum {
108108
CEPH_MOUNT_UNMOUNTED,
109109
CEPH_MOUNT_SHUTDOWN,
110110
CEPH_MOUNT_RECOVER,
111+
CEPH_MOUNT_FENCE_IO,
111112
};
112113

113114
#define CEPH_ASYNC_CREATE_CONFLICT_BITS 8

0 commit comments

Comments
 (0)