Skip to content

Commit 3fb2db6

Browse files
committed
Merge PR ceph#61547 into main
* refs/pull/61547/head: qa: enable libcephfs debug logs for fio workload with nfs Reviewed-by: Patrick Donnelly <[email protected]> Reviewed-by: Venky Shankar <[email protected]>
2 parents 5da9c56 + 268cd91 commit 3fb2db6

File tree

1 file changed

+118
-0
lines changed

1 file changed

+118
-0
lines changed

qa/tasks/cephfs/test_nfs.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import time
55
import logging
66
from io import BytesIO, StringIO
7+
import yaml
78

89
from tasks.mgr.mgr_test_case import MgrTestCase
910
from teuthology import contextutil
@@ -497,6 +498,121 @@ def update_export(self, cluster_id, path, pseudo, fs_name):
497498
}
498499
}))
499500

501+
def apply_ganesha_spec(self, spec):
    """
    Apply a ganesha service spec and wait for the daemon to be redeployed.

    The spec is piped to `ceph orch apply -i -`. Redeployment is detected
    by polling `pgrep ganesha.nfsd` until the reported pid differs from
    the one captured before the spec was applied. Waiting is required
    because a redeploy that happens later would reset any conf changes.

    :param spec: ganesha daemon spec (YAML)
    """
    def current_pid():
        # stripped stdout of pgrep as reported by the first run result
        procs = self.ctx.cluster.run(args=["sudo", "pgrep", "ganesha.nfsd"],
                                     stdout=StringIO(),
                                     stderr=StringIO())
        return procs[0].stdout.getvalue().strip()

    pid_before = current_pid()
    self.ctx.cluster.run(args=['ceph', 'orch', 'apply', '-i', '-'],
                         stdin=spec)
    with contextutil.safe_while(sleep=4, tries=15) as proceed:
        while proceed():
            try:
                pid_now = current_pid()
            except CommandFailedError:
                # no pid if the redeployment is in progress
                log.info('waiting for ganesha daemon redeployment')
                continue
            if pid_now != pid_before:
                # new pid i.e. redeployment done
                break
523+
524+
def enable_libcephfs_logging(self, cluster_name):
    """
    enable ceph client logs by adding a volume mount to ganesha daemon's
    unit.run using `ceph orch apply -i <spec>` and adding client log path
    to /var/lib/ceph/{fsid}/{ganesha_daemon}/config

    Once the conf is updated, the method asserts that the client section
    is present and waits (up to 60s) for the client log file to become
    readable and non-empty.

    :param cluster_name: nfs cluster name
    :return: tuple of (original ganesha spec dumped as YAML, ganesha conf
             file path, original ganesha conf contents); these are the
             arguments disable_libcephfs_logging() takes to undo the changes
    """
    fsid = self._cmd("fsid").strip()

    # add volume mount for ceph client logging from /var/log/ceph/$fsid:/var/log/ceph:z
    ganesha_spec = self._cmd("orch", "ls", "--service-name",
                             f"nfs.{cluster_name}", "--export").strip()
    parsed_ganesha_spec = yaml.safe_load(ganesha_spec)
    # dump the untouched spec first so it can be handed back to the caller
    original_ganesha_spec = yaml.dump(parsed_ganesha_spec)
    # NOTE: this assignment replaces any extra_container_args already in the spec
    parsed_ganesha_spec["extra_container_args"] = ["-v",
                                                   f"/var/log/ceph/{fsid}:/var/log/ceph:z"]
    # the two replace() calls rewrite both list items into double-quoted
    # form in the dumped YAML text
    debug_enabled_ganesha_spec = yaml.dump(parsed_ganesha_spec).replace("- -v", '- "-v"').replace(
        f"- /var/log/ceph/{fsid}:/var/log/ceph:z", f'- "/var/log/ceph/{fsid}:/var/log/ceph:z"')
    log.debug(f"debug enabled ganesha spec: {debug_enabled_ganesha_spec}")

    self.apply_ganesha_spec(debug_enabled_ganesha_spec)

    # add client debug to /var/lib/ceph/$fsid/$ganesha_daemon/config
    # daemon name is the first space-delimited field of the second output
    # line (presumably the first row after a header -- TODO confirm)
    ganesha_daemon = ((self._orch_cmd("ps", "--daemon-type", "nfs")).split("\n")[1].split(' ')[0]).strip()
    GANESHA_CONF_FILE_PATH = f"/var/lib/ceph/{fsid}/{ganesha_daemon}/config"

    # snapshot of the conf before any modification; returned to the caller
    original_ganesha_conf = (self.ctx.cluster.run(args=["sudo", "cat", GANESHA_CONF_FILE_PATH],
                                                  stdout=StringIO(),
                                                  stderr=StringIO()))[0].stdout.getvalue().strip()
    # only append the client section when the conf does not already have one
    # NOTE(review): the restart is taken to be part of this guarded block -- confirm
    if "[client]" not in original_ganesha_conf:
        s = f"[client]\n\tdebug client = 20\n\tlog file = /var/log/ceph/ceph-client.nfs.{cluster_name}.log"
        # `tee -a` appends, leaving the existing conf contents in place
        self._sys_cmd(["echo", Raw(f'"{s}"'), Raw("|"), "sudo", "tee", Raw("-a"), GANESHA_CONF_FILE_PATH])
        # restart ganesha daemon for the changes to take effect
        self._orch_cmd("restart", f"nfs.{cluster_name}")

    # ensure log level and file path exists
    ganesha_conf_debug_enabled = (self.ctx.cluster.run(args=["sudo", "cat", GANESHA_CONF_FILE_PATH],
                                                       stdout=StringIO(),
                                                       stderr=StringIO()))[0].stdout.getvalue().strip()
    self.assertIn("[client]", ganesha_conf_debug_enabled)
    self.assertIn("debug client = 20", ganesha_conf_debug_enabled)
    self.assertIn(f"log file = /var/log/ceph/ceph-client.nfs.{cluster_name}.log",
                  ganesha_conf_debug_enabled)

    def check_libcephfs_log():
        # path under /var/log/ceph/{fsid}: the directory that the volume
        # mount added above maps to /var/log/ceph, where the conf's
        # `log file` entry points
        LIBCEPHFS_LOG_FILE_PATH = f"/var/log/ceph/{fsid}/ceph-client.nfs.{cluster_name}.log"
        # check_status=False so a failing read is reported through
        # returncode below and can be retried
        libcephfs_log = (self.ctx.cluster.run(args=["sudo", "cat",
                                                    LIBCEPHFS_LOG_FILE_PATH,
                                                    Raw("|"), "tail", "-n", "2"],
                                              check_status=False,
                                              stdout=StringIO(),
                                              stderr=StringIO()))
        if libcephfs_log[0].returncode != 0:
            log.debug(f"failed to read {LIBCEPHFS_LOG_FILE_PATH}, retrying")
            return False
        if len(libcephfs_log[0].stdout.getvalue().strip()) == 0:
            log.debug(f"log file {LIBCEPHFS_LOG_FILE_PATH} empty, retrying")
            return False
        return True

    # usually appears in no time, sometimes might take a second or two for the log file to appear
    self.wait_until_true(check_libcephfs_log, timeout=60)

    return original_ganesha_spec, GANESHA_CONF_FILE_PATH, original_ganesha_conf
588+
589+
def disable_libcephfs_logging(self, cluster_name, ganesha_spec, conf_path, ganesha_conf):
    """
    disable ceph client logs by reverting back to the primary ganesha spec and removing debug level
    and file path from /var/lib/ceph/{fsid}/{ganesha_daemon}/config

    The conf file is only rewritten (and the daemon only restarted) when
    it currently contains a "[client]" section.

    :param cluster_name: nfs cluster name
    :param ganesha_spec: primary spec (spec prior to adding debug volume mount)
    :param conf_path: ganesha conf file path
    :param ganesha_conf: primary ganesha conf (conf prior to adding debug level and path)
    """
    # re-apply the primary spec first; apply_ganesha_spec waits for the redeploy
    self.apply_ganesha_spec(ganesha_spec)

    # remove ceph client debug info from ganesha conf
    conf_content = (self.ctx.cluster.run(args=["sudo", "cat", conf_path],
                                         stdout=StringIO(),
                                         stderr=StringIO()))[0].stdout.getvalue().strip()
    # NOTE(review): every statement down to the restart is taken to be
    # guarded by this check -- confirm
    if "[client]" in conf_content:
        # empty the file, then write the primary conf back (tee without -a)
        self.ctx.cluster.run(args=['sudo', 'truncate', Raw("-s"), "0", conf_path])
        self._sys_cmd(["echo", Raw(f'"{ganesha_conf}"'), Raw("|"), "sudo", "tee", conf_path])
        # read the file back and verify none of the debug settings remain
        default_conf = (self.ctx.cluster.run(args=["sudo", "cat", conf_path],
                                             stdout=StringIO(),
                                             stderr=StringIO()))[0].stdout.getvalue().strip()
        self.assertNotIn("[client]", default_conf)
        self.assertNotIn("debug client = 20", default_conf)
        self.assertNotIn(f"log file = /var/log/ceph/ceph-client.nfs.{cluster_name}.log", default_conf)
        # restart ganesha daemon for the changes to take effect
        self._orch_cmd("restart", f"nfs.{cluster_name}")
615+
500616
def test_create_and_delete_cluster(self):
501617
'''
502618
Test successful creation and deletion of the nfs cluster.
@@ -681,11 +797,13 @@ def test_async_io_fio(self):
681797
Test async io using fio. Expect completion without hang or crash
682798
'''
683799
self._test_create_cluster()
800+
ganesha_spec, conf_path, conf = self.enable_libcephfs_logging(self.cluster_id)
684801
self._create_export(export_id='1', create_fs=True,
685802
extra_cmd=['--pseudo-path', self.pseudo_path])
686803
port, ip = self._get_port_ip_info()
687804
self._check_nfs_cluster_status('running', 'NFS Ganesha cluster restart failed')
688805
self._test_fio(self.pseudo_path, port, ip)
806+
self.disable_libcephfs_logging(self.cluster_id, ganesha_spec, conf_path, conf)
689807
self._test_delete_cluster()
690808

691809
def test_cluster_info(self):

0 commit comments

Comments
 (0)