
Commit 100415d (2 parents: 16126b1 + 8547e57)

Merge PR ceph#64663 into main

* refs/pull/64663/head:

Reviewed-by: Venky Shankar <[email protected]>
Reviewed-by: Patrick Donnelly <[email protected]>

7 files changed, +139 -8 lines

qa/cephfs/clusters/1-node-1-mds-1-osd.yaml renamed to qa/cephfs/clusters/1-node-4-mds-1-osd.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 roles:
-- [mon.a, mgr.x, mds.a, osd.0, client.0]
+- [mon.a, mgr.x, mds.a, mds.b, mds.c, mds.d, osd.0, client.0]
 openstack:
 - volumes: # attached to each instance
     count: 1

qa/suites/fs/full/clusters/1-node-1-mds-1-osd.yaml

Lines changed: 0 additions & 1 deletion
This file was deleted.

qa/suites/fs/full/clusters/1-node-4-mds-1-osd.yaml (new file)

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+.qa/cephfs/clusters/1-node-4-mds-1-osd.yaml

qa/suites/fs/full/overrides.yaml

Lines changed: 1 addition & 0 deletions
@@ -17,3 +17,4 @@ overrides:
       - OSD_OUT_OF_ORDER_FULL
       - OSD_NEARFULL
       - OSD_FULL
+      - MGR_DOWN
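
MGR_DOWN is ignorelisted here because the new workunit (added later in this commit) repeatedly fails the active mgr to invalidate its readdir cache, so transient MGR_DOWN health warnings are expected while the test runs. As a hedged illustration only, on a disposable test cluster with a single mgr named x (matching the mgr.x role in the cluster yaml above), the warning can be observed like this:

    # illustration only; run against a throwaway test cluster, never production
    ceph mgr fail x                              # the same command the workunit's restart_mgr uses
    ceph health detail | grep MGR_DOWN || true   # transient warning until a mgr is active again
    ceph status | grep mgr                       # shows the mgr coming back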

qa/suites/fs/full/tasks/mgr-osd-full.yaml

Lines changed: 5 additions & 0 deletions
@@ -29,3 +29,8 @@ tasks:
     clients:
       client.0:
         - fs/full/subvolume_snapshot_rm.sh
+- workunit:
+    cleanup: true
+    clients:
+      client.0:
+        - fs/full/subvolume_ls.sh
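
For orientation: the workunit task runs each listed script from the qa/workunits/ tree on the named client, with that client's CephFS mount exported to the script as CEPH_MNT (which is why the new script below refers to $CEPH_MNT). A rough local equivalent, assuming a test cluster with CephFS mounted at /mnt/cephfs and a local checkout of the ceph repository (both assumptions, not part of this change), would be:

    # minimal local sketch, not the teuthology workunit task itself
    export CEPH_MNT=/mnt/cephfs    # assumption: where the test CephFS is mounted
    cd ceph/qa/workunits           # assumption: path inside a local checkout of the ceph repo
    bash fs/full/subvolume_ls.sh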

qa/workunits/fs/full/subvolume_ls.sh (new file)

Lines changed: 119 additions & 0 deletions
@@ -0,0 +1,119 @@
#!/usr/bin/env bash
set -ex

# This testcase tests the scenario of the 'ceph fs subvolume ls' mgr command
# when the osd is full. The command used to miss out a few subvolumes in the
# list. The issue happens in a multi-mds active setup. Please see the tracker
# https://tracker.ceph.com/issues/72260

# The suite sets the 'bluestore block size' to 2GiB, so the osd is 2GiB in
# size. 25 subvolumes are created and a 1GB file is written at the root.
# The full-ratios are set such that the osd is treated as full with less than
# 500MB of data. The subvolumes are then listed 20 times, with an mgr failover
# before each listing (to invalidate the readdir cache), and the count is
# validated each time.

SUBVOL_CNT=25

expect_failure() {
  if "$@"; then return 1; else return 0; fi
}

validate_subvol_cnt() {
  if [ $1 -eq $SUBVOL_CNT ]; then return 0; else return 1; fi
}

restart_mgr() {
  ceph mgr fail x
  timeout=30
  while [ $timeout -gt 0 ]
  do
    active_mgr_cnt=$(ceph status | grep mgr | grep active | grep -v no | wc -l)
    if [ $active_mgr_cnt -eq 1 ]; then break; fi
    echo "Waiting for mgr to be active after failover: $timeout"
    sleep 1
    let "timeout-=1"
  done
}

# Set client_use_random_mds
ceph config set client client_use_random_mds true

# Set max_mds to 3
ceph fs set cephfs max_mds 3
timeout=30
while [ $timeout -gt 0 ]
do
  active_cnt=$(ceph fs status | grep active | wc -l)
  if [ $active_cnt -eq 2 ]; then break; fi
  echo "Waiting for the active mds count to be 2: $timeout"
  sleep 1
  let "timeout-=1"
done

# Create subvolumes
for i in $(seq 1 $SUBVOL_CNT); do ceph fs subvolume create cephfs sub_$i; done

# For debugging
echo "Before write"
df -h
ceph osd df

sudo dd if=/dev/urandom of=$CEPH_MNT/1GB_file-1 status=progress bs=1M count=1000

# The suite (qa/suites/fs/full/tasks/mgr-osd-full.yaml) sets the 'bluestore block size'
# to 2GiB, so the osd is 2GiB in size. The full-ratios set below make sure that the
# osd is treated as full with less than 500MB of data.
ceph osd set-full-ratio 0.2
ceph osd set-nearfull-ratio 0.16
ceph osd set-backfillfull-ratio 0.18

timeout=30
while [ $timeout -gt 0 ]
do
  health=$(ceph health detail)
  [[ $health = *"OSD_FULL"* ]] && echo "OSD is full" && break
  echo "Waiting for osd to be full: $timeout"
  sleep 1
  let "timeout-=1"
done

# For debugging
echo "After ratio set"
df -h
ceph osd df

# Clear the readdir cache by failing over the mgr, which forces a new libcephfs connection.
# Validate subvolume ls 20 times.
for i in {1..20};
do
  restart_mgr
  # List and validate the subvolume count
  subvol_cnt=$(ceph fs subvolume ls cephfs --format=json-pretty | grep sub_ | wc -l)
  validate_subvol_cnt $subvol_cnt
done

# Delete all subvolumes
for i in $(seq 1 $SUBVOL_CNT); do ceph fs subvolume rm cephfs sub_$i; done

# Wait for the subvolume data to be deleted
trashdir=$CEPH_MNT/volumes/_deleting
timeout=30
while [ $timeout -gt 0 ]
do
  [ -z "$(sudo ls -A $trashdir)" ] && echo "Trash directory $trashdir is empty" && break
  echo "Waiting for trash dir to be empty: $timeout"
  sleep 1
  let "timeout-=1"
done

sudo rm -f $CEPH_MNT/1GB_file-1

# Set the ratios back for other full tests to run
ceph osd set-full-ratio 0.95
ceph osd set-nearfull-ratio 0.95
ceph osd set-backfillfull-ratio 0.95

# After test
echo "After test"
df -h
ceph osd df

echo OK
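
To make the ratio arithmetic in the script's comments concrete: with the 2GiB bluestore block size from the suite and the full-ratio of 0.2 the script sets, the OSD reports full at roughly 410MiB, well under the ~1000MiB the dd writes. A small standalone sketch of that math (plain shell arithmetic, no Ceph commands involved):

    # minimal sketch of the threshold math the test relies on
    osd_bytes=$((2 * 1024 * 1024 * 1024))        # 2GiB 'bluestore block size' -> OSD capacity
    full_pct=20                                  # full-ratio 0.2, expressed in percent for integer math
    full_bytes=$((osd_bytes * full_pct / 100))
    echo "OSD_FULL trips at about $((full_bytes / 1024 / 1024)) MiB"   # ~409 MiB, i.e. under 500MB
    write_bytes=$((1000 * 1024 * 1024))          # the dd above writes ~1000 MiB
    [ "$write_bytes" -gt "$full_bytes" ] && echo "the 1GB write is comfortably past the full threshold"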

src/mds/Server.cc

Lines changed: 12 additions & 6 deletions
@@ -2783,11 +2783,6 @@ void Server::dispatch_client_request(const MDRequestRef& mdr)
   }

   if (is_full) {
-    CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
-    if (!cur) {
-      // the request is already responded to
-      return;
-    }
     if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
         req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
         req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
@@ -2800,7 +2795,18 @@ void Server::dispatch_client_request(const MDRequestRef& mdr)
          req->get_op() == CEPH_MDS_OP_RENAME) &&
         (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started peer request
        ) {
-
+      /*
+       * The inode fetch below is specific to the operations above and the inode is
+       * expected to be in memory, as these operations are likely preceded by a lookup.
+       * Doing this generically outside the condition was incorrect: ops like getattr
+       * might not have the inode in memory (e.g. on a non-auth mds) and would fail
+       * with ESTALE, confusing the client instead of being forwarded to the auth mds.
+       */
+      CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
+      if (!cur) {
+        // the request is already responded to
+        return;
+      }
       if (check_access(mdr, cur, MAY_FULL)) {
         dout(20) << __func__ << ": full, has FULL caps, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
       } else {
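
As a rough client-side sanity check of what this change is about: since the auth-inode fetch now happens only for the mutating ops listed in the condition, a plain metadata read should keep working once the cluster is full instead of surfacing an error. A hedged sketch, assuming $CEPH_MNT points at a mounted CephFS on a cluster already driven to OSD_FULL (for example by the workunit above):

    # minimal sketch, assuming a mounted CephFS and a cluster already at OSD_FULL
    ceph health detail | grep -q OSD_FULL || echo "note: cluster is not actually full"
    # a plain getattr-style operation should not error out just because the OSD is full
    stat "$CEPH_MNT"/ > /dev/null && echo "getattr on a full cluster succeeded as expected"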
