Skip to content

Commit 76e6fb0

Browse files
committed
Merge PR ceph#53839 into main
* refs/pull/53839/head:
  - qa: enhance test cases
  - mds: erase clients getting evicted from laggy_clients
  - mds: report clients laggy due laggy OSDs only after checking any OSD is laggy

Reviewed-by: Venky Shankar <[email protected]>
2 parents 660fe1e + 9005451 commit 76e6fb0

File tree

3 files changed

+31
-16
lines changed

3 files changed

+31
-16
lines changed

qa/tasks/cephfs/test_client_recovery.py

Lines changed: 23 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
1313
import string
1414
import os
1515

16+
from teuthology import contextutil
1617
from teuthology.orchestra import run
1718
from teuthology.exceptions import CommandFailedError
1819
from tasks.cephfs.fuse_mount import FuseMount
@@ -808,24 +809,27 @@ def test_client_eviction_if_config_is_set(self):
808809
# it takes time to have laggy clients entries in cluster log,
809810
# wait for 6 minutes to see if it is visible, finally restart
810811
# the client
811-
tries = 6
812-
while True:
813-
try:
814-
with self.assert_cluster_log("1 client(s) laggy due to laggy OSDs",
815-
timeout=55):
816-
# make sure clients weren't evicted
817-
self.assert_session_count(2)
818-
break
819-
except AssertionError:
820-
tries -= 1
821-
if tries:
822-
continue
823-
raise
812+
with contextutil.safe_while(sleep=5, tries=6) as proceed:
813+
while proceed():
814+
try:
815+
with self.assert_cluster_log("1 client(s) laggy due to"
816+
" laggy OSDs",
817+
timeout=55):
818+
# make sure clients weren't evicted
819+
self.assert_session_count(2)
820+
break
821+
except (AssertionError, CommandFailedError) as e:
822+
log.debug(f'{e}, retrying')
823+
824+
# clear lagginess, expect to get the warning cleared and make sure
825+
# client gets evicted
826+
self.clear_laggy_params(osd)
827+
self.wait_for_health_clear(60)
828+
self.assert_session_count(1)
824829
finally:
825830
self.mount_a.kill_cleanup()
826831
self.mount_a.mount_wait()
827832
self.mount_a.create_destroy()
828-
self.clear_laggy_params(osd)
829833

830834
def test_client_eviction_if_config_is_unset(self):
831835
"""
@@ -857,6 +861,11 @@ def test_client_eviction_if_config_is_unset(self):
857861

858862
time.sleep(session_timeout)
859863
self.assert_session_count(1)
864+
865+
# make sure warning wasn't seen in cluster log
866+
with self.assert_cluster_log("laggy due to laggy OSDs",
867+
timeout=120, present=False):
868+
pass
860869
finally:
861870
self.mount_a.kill_cleanup()
862871
self.mount_a.mount_wait()

src/mds/Beacon.cc

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -500,13 +500,17 @@ void Beacon::notify_health(MDSRank const *mds)
500500

501501
// Report laggy client(s) due to laggy OSDs
502502
{
503+
bool defer_client_eviction =
504+
g_conf().get_val<bool>("defer_client_eviction_on_laggy_osds")
505+
&& mds->objecter->with_osdmap([](const OSDMap &map) {
506+
return map.any_osd_laggy(); });
503507
auto&& laggy_clients = mds->server->get_laggy_clients();
504-
if (!laggy_clients.empty()) {
508+
if (defer_client_eviction && !laggy_clients.empty()) {
505509
std::vector<MDSHealthMetric> laggy_clients_metrics;
506510
for (const auto& laggy_client: laggy_clients) {
507511
CachedStackStringStream css;
508512
*css << "Client " << laggy_client << " is laggy; not evicted"
509-
<< " because some OSD(s) is/are laggy";
513+
<< " because some OSD(s) is/are laggy";
510514
MDSHealthMetric m(MDS_HEALTH_CLIENTS_LAGGY, HEALTH_WARN, css->strv());
511515
laggy_clients_metrics.emplace_back(std::move(m));
512516
}

src/mds/Server.cc

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1277,6 +1277,8 @@ void Server::find_idle_sessions()
12771277
kill_session(session, NULL);
12781278
}
12791279
}
1280+
// clear as there's no use to keep the evicted clients in laggy_clients
1281+
clear_laggy_clients();
12801282
}
12811283

12821284
void Server::evict_cap_revoke_non_responders() {

0 commit comments

Comments
 (0)