Skip to content

Commit 268cd91

Browse files
committed
qa: enable libcephfs debug logs for fio workload with nfs
This acts as a stopgap to get libcephfs logs in teuthology runs while https://tracker.ceph.com/issues/69895 is being discussed and implemented. Signed-off-by: Dhairya Parmar <[email protected]>
1 parent fe3413f commit 268cd91

File tree

1 file changed

+118
-0
lines changed

1 file changed

+118
-0
lines changed

qa/tasks/cephfs/test_nfs.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import time
55
import logging
66
from io import BytesIO, StringIO
7+
import yaml
78

89
from tasks.mgr.mgr_test_case import MgrTestCase
910
from teuthology import contextutil
@@ -487,6 +488,121 @@ def update_export(self, cluster_id, path, pseudo, fs_name):
487488
}
488489
}))
489490

491+
def apply_ganesha_spec(self, spec):
    """
    Apply a ganesha service spec and block until the daemon is redeployed.

    Waiting matters because a redeploy that finishes later would reset any
    conf changes made in the meantime.

    :param spec: ganesha daemon spec (YAML)
    """
    def running_pid():
        # stripped `pgrep` output taken from the first result of the run;
        # the loop below expects CommandFailedError while no daemon matches
        procs = self.ctx.cluster.run(args=["sudo", "pgrep", "ganesha.nfsd"],
                                     stdout=StringIO(),
                                     stderr=StringIO())
        return procs[0].stdout.getvalue().strip()

    # remember the pid before applying so that a change can be detected
    pid_before = running_pid()
    self.ctx.cluster.run(args=['ceph', 'orch', 'apply', '-i', '-'],
                         stdin=spec)

    with contextutil.safe_while(sleep=4, tries=15) as proceed:
        while proceed():
            try:
                pid_now = running_pid()
            except CommandFailedError:
                # no pid if the redeployment is in progress
                log.info('waiting for ganesha daemon redeployment')
                continue
            if pid_now != pid_before:
                # new pid i.e. redeployment done
                break
513+
514+
def enable_libcephfs_logging(self, cluster_name):
    """
    Enable ceph client (libcephfs) debug logs for an nfs cluster.

    Adds a log volume mount to the ganesha daemon's unit.run using
    `ceph orch apply -i <spec>`, appends a [client] section carrying the
    debug level and client log path to
    /var/lib/ceph/{fsid}/{ganesha_daemon}/config, and then waits for the
    client log file to show up with content.

    :param cluster_name: nfs cluster name
    :return: tuple of (original ganesha spec as YAML, ganesha conf file
             path, original ganesha conf content); hand these to
             disable_libcephfs_logging() to undo the changes
    """
    fsid = self._cmd("fsid").strip()
    log_volume = f"/var/log/ceph/{fsid}:/var/log/ceph:z"
    # path written into the conf, i.e. the /var/log/ceph side of the mount
    client_log_file = f"/var/log/ceph/ceph-client.nfs.{cluster_name}.log"
    # same file as reachable through the /var/log/ceph/$fsid side of the mount
    host_log_file = f"/var/log/ceph/{fsid}/ceph-client.nfs.{cluster_name}.log"

    def read_file(path):
        # stripped contents of `path`, taken from the first result of the run
        procs = self.ctx.cluster.run(args=["sudo", "cat", path],
                                     stdout=StringIO(),
                                     stderr=StringIO())
        return procs[0].stdout.getvalue().strip()

    # add volume mount for ceph client logging from /var/log/ceph/$fsid:/var/log/ceph:z
    ganesha_spec = self._cmd("orch", "ls", "--service-name",
                             f"nfs.{cluster_name}", "--export").strip()
    parsed_ganesha_spec = yaml.safe_load(ganesha_spec)
    # dumped before the mutation below so that the caller can restore it later
    original_ganesha_spec = yaml.dump(parsed_ganesha_spec)
    parsed_ganesha_spec["extra_container_args"] = ["-v", log_volume]
    # rewrite both container args as double-quoted YAML scalars
    debug_enabled_ganesha_spec = (
        yaml.dump(parsed_ganesha_spec)
        .replace("- -v", '- "-v"')
        .replace(f"- {log_volume}", f'- "{log_volume}"')
    )
    log.debug(f"debug enabled ganesha spec: {debug_enabled_ganesha_spec}")

    self.apply_ganesha_spec(debug_enabled_ganesha_spec)

    # add client debug to /var/lib/ceph/$fsid/$ganesha_daemon/config
    # NOTE(review): the daemon name is the first space-separated field of the
    # second line of `orch ps --daemon-type nfs`, whichever nfs daemon that
    # is - confirm this is fine when several nfs clusters exist
    orch_ps_lines = self._orch_cmd("ps", "--daemon-type", "nfs").split("\n")
    ganesha_daemon = orch_ps_lines[1].split(' ')[0].strip()
    ganesha_conf_path = f"/var/lib/ceph/{fsid}/{ganesha_daemon}/config"

    original_ganesha_conf = read_file(ganesha_conf_path)
    if "[client]" not in original_ganesha_conf:
        s = f"[client]\n\tdebug client = 20\n\tlog file = {client_log_file}"
        self._sys_cmd(["echo", Raw(f'"{s}"'), Raw("|"), "sudo", "tee", Raw("-a"), ganesha_conf_path])
        # restart ganesha daemon for the changes to take effect
        self._orch_cmd("restart", f"nfs.{cluster_name}")

    # ensure log level and file path exists
    ganesha_conf_debug_enabled = read_file(ganesha_conf_path)
    self.assertIn("[client]", ganesha_conf_debug_enabled)
    self.assertIn("debug client = 20", ganesha_conf_debug_enabled)
    self.assertIn(f"log file = {client_log_file}", ganesha_conf_debug_enabled)

    def check_libcephfs_log():
        # Run `tail` on the file directly: with `cat FILE | tail` the
        # pipeline's exit status is tail's, so a missing or unreadable file
        # could never be told apart from an empty one by the return code.
        proc = self.ctx.cluster.run(args=["sudo", "tail", "-n", "2",
                                          host_log_file],
                                    check_status=False,
                                    stdout=StringIO(),
                                    stderr=StringIO())[0]
        if proc.returncode != 0:
            log.debug(f"failed to read {host_log_file}, retrying")
            return False
        if not proc.stdout.getvalue().strip():
            log.debug(f"log file {host_log_file} empty, retrying")
            return False
        return True

    # usually appears in no time, sometimes might take a second or two for the log file to appear
    self.wait_until_true(check_libcephfs_log, timeout=60)

    return original_ganesha_spec, ganesha_conf_path, original_ganesha_conf
578+
579+
def disable_libcephfs_logging(self, cluster_name, ganesha_spec, conf_path, ganesha_conf):
    """
    Disable ceph client logs by reverting back to the primary ganesha spec
    and removing debug level and file path from
    /var/lib/ceph/{fsid}/{ganesha_daemon}/config

    :param cluster_name: nfs cluster name
    :param ganesha_spec: primary spec (spec prior to adding debug volume mount)
    :param conf_path: ganesha conf file path
    :param ganesha_conf: primary ganesha conf (conf prior to adding debug level and path)
    """
    def read_conf():
        # stripped contents of the conf file, taken from the first result of the run
        procs = self.ctx.cluster.run(args=["sudo", "cat", conf_path],
                                     stdout=StringIO(),
                                     stderr=StringIO())
        return procs[0].stdout.getvalue().strip()

    self.apply_ganesha_spec(ganesha_spec)

    # remove ceph client debug info from ganesha conf
    if "[client]" not in read_conf():
        # nothing was left behind, so there is nothing to rewrite or restart
        return

    # Feed the saved conf over stdin rather than splicing it into a shell
    # `echo "..."`: quotes, `$`, backticks or backslashes in the conf would
    # otherwise be mangled by the shell. `tee` without -a truncates the
    # file itself, so a separate `truncate -s 0` is not needed. The trailing
    # newline mirrors what `echo` used to append.
    self.ctx.cluster.run(args=["sudo", "tee", conf_path],
                         stdin=f"{ganesha_conf}\n",
                         stdout=StringIO(),
                         stderr=StringIO())

    default_conf = read_conf()
    self.assertNotIn("[client]", default_conf)
    self.assertNotIn("debug client = 20", default_conf)
    self.assertNotIn(f"log file = /var/log/ceph/ceph-client.nfs.{cluster_name}.log", default_conf)
    # restart ganesha daemon for the changes to take effect
    self._orch_cmd("restart", f"nfs.{cluster_name}")
605+
490606
def test_create_and_delete_cluster(self):
491607
'''
492608
Test successful creation and deletion of the nfs cluster.
@@ -671,11 +787,13 @@ def test_async_io_fio(self):
671787
Test async io using fio. Expect completion without hang or crash
672788
'''
673789
self._test_create_cluster()
790+
ganesha_spec, conf_path, conf = self.enable_libcephfs_logging(self.cluster_id)
674791
self._create_export(export_id='1', create_fs=True,
675792
extra_cmd=['--pseudo-path', self.pseudo_path])
676793
port, ip = self._get_port_ip_info()
677794
self._check_nfs_cluster_status('running', 'NFS Ganesha cluster restart failed')
678795
self._test_fio(self.pseudo_path, port, ip)
796+
self.disable_libcephfs_logging(self.cluster_id, ganesha_spec, conf_path, conf)
679797
self._test_delete_cluster()
680798

681799
def test_cluster_info(self):

0 commit comments

Comments
 (0)