Skip to content

Commit 1d5479e

Browse files
jayaragini-hclPattela JAYARAGINI
authored andcommitted
gNOI: Add backend support for Healthz
1 parent 6507fa3 commit 1d5479e

File tree

5 files changed

+670
-15
lines changed

5 files changed

+670
-15
lines changed

host_modules/debug_info.py

Lines changed: 324 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,324 @@
1+
"""Debug Info.
2+
3+
This host service module implements the backend support for
4+
collecting host debug artifacts. Depending on the log level
5+
and board type input, the relevant log files, DB snapshots,
6+
counters, record files, and various command outputs are collected
7+
and aggregated under a specified artifact directory and the directory is
8+
compressed to a *.tar.gz in the host.
9+
10+
As part of the SONiC-supported common debug commands, the following files are collected:
11+
core, log, db, counter files, routing.txt and version.txt
12+
"""
13+
14+
from datetime import datetime
15+
import json
16+
import logging
17+
import os
18+
import shutil
19+
import subprocess
20+
import time
21+
22+
from host_modules import host_service
23+
# Import SONiC debug commands for SONiC platform.
24+
from utils.sonic_debug_cmds import *
25+
26+
# Name used for the DBus bus name and returned by register().
MOD_NAME = "debug_info"

# Volatile staging area: per-request directories and the resulting *.tar.gz
# archives are created underneath it.
ARTIFACT_DIR = "/tmp/dump"

# Persistent-storage settings, used when a request asks for a persistent copy.
NONVOLATILE_PARTITION = "/var/log/"
NONVOLATILE_ARTIFACT_DIR = "/var/log/dump"
# Bytes that must remain free on the partition on top of the artifact size.
NONVOLATILE_STORAGE_REQUIRED = 500_000_000
# Marker file; while it exists, saving to persistent storage is skipped.
NONVOLATILE_TMP_FLAG = "/tmp/nonvolatile_saved"

# NOTE(review): presumably the path under which containers see the dump
# directory -- confirm against the callers.
ARTIFACT_DIR_CONTAINER = "/var/dump"

# Layout inside a per-request artifact directory.
ARTIFACT_DIR_HOST = "host"
CORE_DIR = "core"
DB_ARTIFACT_DIR = f"{ARTIFACT_DIR_HOST}/db"

# Values compared against the request's log level.
ARTIFACT_LEVEL_ALERT = "alert"
ARTIFACT_LEVEL_CRITICAL = "critical"
ARTIFACT_LEVEL_ALL = "all"

# Keys looked up in the JSON request.
LOG_LEVEL_KEY = "level"
PERSISTENT_STORAGE_KEY = "use_persistent_storage"

STATE_DB_SEPARATOR = "|"
DEBUG_INFO_FLAG = "debug_info"

# Module log file; the directory is created at import time if it is missing.
log_dir = "/var/log"
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, "debug_info.log")

logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)s - %(message)s",
    filename=log_file,
    filemode="a",  # append mode
)

logger = logging.getLogger(__name__)
57+
58+
class DebugInfo(host_service.HostModule):
59+
"""DBus endpoint that collects debug artifacts."""
60+
61+
def __init__(self, mod_name):
    """Cache the board type and hostname, then initialise the host module.

    Args:
      mod_name: module name forwarded to the HostModule constructor.
    """
    # Both lookups run a shell command and fall back to a default value
    # ("" / "switch") when the command fails.
    self._board_type = DebugInfo.get_board_type()
    self._hostname = DebugInfo.get_hostname()
    super().__init__(mod_name)
65+
66+
@staticmethod
def _run_command(cmd: str, timeout: int = 20):
    """Run `cmd` through the shell and capture its output.

    Args:
      cmd: shell command line to execute.
      timeout: seconds to wait for the command before it is killed.

    Returns:
      A (returncode, stdout, stderr) tuple. When the command does not finish
      within `timeout`, (1, "command timeout", "command timeout") is returned.
    """
    proc = subprocess.Popen(
        cmd,
        shell=True,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        close_fds=True)
    try:
        stdout, stderr = proc.communicate(timeout=timeout)
    except subprocess.TimeoutExpired:
        proc.kill()
        # Reap the killed shell so it does not linger as a zombie. wait() is
        # used instead of communicate() because a grandchild started by the
        # shell may still hold the pipes open and would block the read.
        proc.wait()
        # Release our ends of the pipes instead of leaving them to the GC.
        for pipe in (proc.stdout, proc.stderr):
            if pipe is not None:
                pipe.close()
        return 1, "command timeout", "command timeout"
    return proc.returncode, stdout, stderr
81+
82+
@staticmethod
def get_board_type() -> str:
    """Return the stripped output of BOARD_TYPE_CMD, or "" if it fails."""
    rc, stdout, err = DebugInfo._run_command(BOARD_TYPE_CMD)
    if rc == 0:
        return stdout.strip()
    # The failure is only logged; callers receive an empty board type.
    logger.warning("fail to execute command '%s': %s", BOARD_TYPE_CMD, err)
    return ""
91+
92+
@staticmethod
def get_hostname() -> str:
    """Return the output of the `hostname` command, or "switch" on failure."""
    cmd = "hostname"
    rc, stdout, err = DebugInfo._run_command(cmd)
    if rc == 0:
        return stdout.strip()
    # The failure is only logged; "switch" is the fallback name.
    logger.warning("fail to execute command '%s': %s", cmd, err)
    return "switch"
102+
103+
@staticmethod
def _collect_counter_artifacts(directory: str, prefix: str,
                               board_type: str) -> None:
    """Run every command in COUNTER_CMDS into a timestamped sub-directory.

    The sub-directory is named "<prefix>counter_<YYYYmmdd_HHMMSS>" and is
    created under `directory`. A failing command is logged and skipped.

    Args:
      directory: parent directory for the counter snapshot.
      prefix: literal text placed in front of the directory name.
      board_type: not used by this method.
    """
    # Keep the prefix out of the strftime format so that a '%' in it is never
    # interpreted as a format directive.
    stamp = datetime.now().strftime("counter_%Y%m%d_%H%M%S")
    counter_artifact_dir = os.path.join(directory, prefix + stamp)
    os.makedirs(counter_artifact_dir, exist_ok=True)

    for template in COUNTER_CMDS:
        cmd = template.format(counter_artifact_dir)
        rc, _, err = DebugInfo._run_command(cmd, timeout=60)
        if rc != 0:
            # Continue the artifact collection in case of error; log the
            # command that actually ran rather than its template.
            logger.warning("fail to execute command '%s': %s", cmd, err)
116+
117+
@staticmethod
def _collect_teamdctl_data(artifact_dir_host):
    """Best-effort dump of teamdctl output for every port channel.

    The keys printed by REDIS_LIST_PORTCHANNEL_CMD are split on '|'; the
    second field is used as the trunk name. For each trunk the output of
    TEAMD_CTL_CMD is written to "teamdctl_<trunk>.txt" in `artifact_dir_host`.
    All failures are logged and never raised.

    Args:
      artifact_dir_host: directory that receives the teamdctl_*.txt files.
    """
    # Bound every external command so that a stuck process cannot block the
    # caller forever.
    cmd_timeout = 20

    try:
        redis_result = subprocess.run(
            REDIS_LIST_PORTCHANNEL_CMD, shell=True, capture_output=True,
            text=True, check=True, timeout=cmd_timeout)
    except subprocess.CalledProcessError as e:
        logger.warning("Error running Redis command: %s", e)
        return
    except subprocess.TimeoutExpired as e:
        logger.warning("Timeout running Redis command: %s", e)
        return

    for trunk in redis_result.stdout.strip().split('\n'):
        try:
            trk = trunk.split('|')[1]
        except IndexError:
            # No trunk is found in the DB or the trunk table format is
            # incorrect.
            continue

        # NOTE(review): trk comes from a DB key and is interpolated into a
        # shell command line -- confirm TEAMD_CTL_CMD quotes its placeholder.
        teamdctl_cmd = TEAMD_CTL_CMD.format(trk)
        try:
            teamdctl_result = subprocess.run(
                teamdctl_cmd, shell=True, capture_output=True, text=True,
                timeout=cmd_timeout)
        except subprocess.TimeoutExpired as e:
            logger.warning("Timeout running teamdctl for %s: %s", trk, e)
            continue

        if teamdctl_result.returncode != 0:
            logger.warning("Error running teamdctl for %s: %s", trk,
                           teamdctl_result.stderr)
            continue

        filepath = os.path.join(artifact_dir_host, f'teamdctl_{trk}.txt')
        try:
            with open(filepath, 'w') as f:
                f.write(teamdctl_result.stdout)
        except OSError as e:
            # The output location is unusable; stop here as before, but
            # leave a trace and do not let the error abort the collection.
            logger.warning("Unable to write %s: %s", filepath, e)
            return
146+
147+
@staticmethod
def _save_persistent_storage(artifact_name: str) -> None:
    """Best-effort copy of a collected artifact to persistent storage.

    Nothing is copied when the NONVOLATILE_TMP_FLAG marker already exists,
    when the artifact is missing from ARTIFACT_DIR, or when the partition
    has less free space than NONVOLATILE_STORAGE_REQUIRED plus the artifact
    size. Every failure is logged and swallowed.

    Args:
      artifact_name: file name of the archive inside ARTIFACT_DIR.
    """
    if os.path.isfile(NONVOLATILE_TMP_FLAG):
        logger.warning(
            "%s already exists, skipping saving artifacts to "
            "persistent storage", NONVOLATILE_TMP_FLAG)
        return
    # NOTE(review): the marker is created before the copy is attempted, so a
    # failed attempt also blocks later ones while the marker exists --
    # confirm this is intended.
    try:
        with open(NONVOLATILE_TMP_FLAG, "w+"):
            pass
    except OSError as e:
        logger.warning("error creating flag in tmp: %s. Error: %s",
                       NONVOLATILE_TMP_FLAG, str(e))
        return

    host_artifact_name = ARTIFACT_DIR + "/" + artifact_name
    # Verify the source before touching the previous persistent copy, so a
    # missing artifact does not wipe the old dump for nothing.
    try:
        artifact_size = os.path.getsize(host_artifact_name)
    except OSError:
        logger.warning("path %s did not exist", host_artifact_name)
        return

    destination = os.path.join(NONVOLATILE_ARTIFACT_DIR, artifact_name)
    try:
        # Remove the previous copy before measuring, so the space it
        # occupied counts as free.
        shutil.rmtree(NONVOLATILE_ARTIFACT_DIR, ignore_errors=True)
        _, _, free = shutil.disk_usage(NONVOLATILE_PARTITION)
        required = NONVOLATILE_STORAGE_REQUIRED + artifact_size
        if free < required:
            logger.warning(
                "free space remaining on %s is less than %d: %d. Not saving "
                "artifacts to persistent storage", NONVOLATILE_PARTITION,
                required, free)
            return

        os.makedirs(NONVOLATILE_ARTIFACT_DIR, exist_ok=True)
        # Copy in-process instead of building a shell `cp` command line.
        shutil.copy(host_artifact_name, destination)
    except OSError as e:
        # Report success overall if saving to persistent storage fails,
        # saving to persistent storage is best-effort.
        logger.warning("fail to save %s to %s: %s", host_artifact_name,
                       destination, e)
        try:
            # Do not leave a truncated archive behind.
            os.remove(destination)
        except OSError:
            # Nothing was written, or it cannot be removed; either way there
            # is nothing more to do for a best-effort copy.
            pass
188+
189+
@staticmethod
def collect_artifacts(req: str, timestamp: str, board_type: str,
                      hostname: str):
    """Collect all artifacts for a given board type.

    Currently only host-level and DB artifacts are collected.
    Component-level (e.g., gnmi/orch) artifact collection is not supported.

    This method can also be called by the CLI.

    Args:
      req: a single JSON object string. The LOG_LEVEL_KEY entry selects the
        log level (ARTIFACT_LEVEL_ALERT when absent); the optional
        PERSISTENT_STORAGE_KEY entry indicates whether the artifacts should
        be stored in persistent storage in addition to volatile storage.
      timestamp: a timestamp string that is used in the artifact name.
      board_type: a string representation of the board type.
      hostname: the hostname of the device, used to name the output
        directory.

    Returns:
      A (return code, string) tuple. On success the code is 0 and the string
      is the path of the *.tar.gz artifact in the host; otherwise the string
      describes the failure.
    """
    try:
        request = json.loads(req)
    except json.JSONDecodeError:
        return 1, "invalid input: " + req
    # Valid JSON that is not an object (e.g. "[]" or "5") has no .get();
    # reject it the same way as malformed JSON instead of crashing.
    if not isinstance(request, dict):
        return 1, "invalid input: " + req

    log_level = request.get(LOG_LEVEL_KEY, ARTIFACT_LEVEL_ALERT)
    use_persistent_storage = request.get(PERSISTENT_STORAGE_KEY, False)
    # Counter and DB artifacts are only gathered for these two levels.
    extended = log_level in (ARTIFACT_LEVEL_CRITICAL, ARTIFACT_LEVEL_ALL)

    dir_name = hostname + "_" + timestamp
    staging_dir = os.path.join(ARTIFACT_DIR, dir_name)
    artifact_dir_host = os.path.join(staging_dir, ARTIFACT_DIR_HOST)
    db_artifact_dir = os.path.join(staging_dir, DB_ARTIFACT_DIR)

    os.makedirs(artifact_dir_host, exist_ok=True)

    # Collect counter artifacts at the beginning of the collection.
    if extended:
        DebugInfo._collect_counter_artifacts(artifact_dir_host, "pre_",
                                             board_type)

    for template in COMMON_CMDS:
        cmd = template.format(artifact_dir_host)
        rc, _, err = DebugInfo._run_command(cmd)
        if rc != 0:
            # Continue the artifact collection in case of error.
            logger.warning("fail to execute command '%s': %s", cmd, err)

    # Create host/core dir if it does not exist.
    os.makedirs(os.path.join(artifact_dir_host, CORE_DIR), exist_ok=True)

    DebugInfo._collect_teamdctl_data(artifact_dir_host)

    if extended:
        os.makedirs(db_artifact_dir, exist_ok=True)
        for template in DB_CMDS:
            cmd = template.format(db_artifact_dir)
            rc, _, err = DebugInfo._run_command(cmd, timeout=60)
            if rc != 0:
                # Continue the artifact collection in case of error.
                logger.warning("fail to execute command '%s': %s", cmd, err)

        # Collect counter artifacts at the end of the collection.
        DebugInfo._collect_counter_artifacts(artifact_dir_host, "post_",
                                             board_type)

    artifact_name = dir_name + ".tar.gz"
    host_artifact_name = ARTIFACT_DIR + "/" + artifact_name

    # NOTE(review): dir_name is interpolated unquoted into a shell command;
    # callers must pass a shell-safe hostname and timestamp.
    cmd = ("tar -C " + ARTIFACT_DIR + " -zcvf " + host_artifact_name + " " +
           dir_name)

    rc, _, err = DebugInfo._run_command(cmd, timeout=60)
    # The staging directory is removed whether or not the archive was built.
    shutil.rmtree(staging_dir, ignore_errors=True)
    if rc != 0:
        return rc, "fail to execute command '" + cmd + "': " + err

    if use_persistent_storage:
        DebugInfo._save_persistent_storage(artifact_name)

    return 0, host_artifact_name
270+
271+
@host_service.method(
    host_service.bus_name(MOD_NAME), in_signature="as", out_signature="is")
def collect(self, options):
    """DBus entrypoint to collect debug artifacts from host.

    Args:
      options: list whose first element is the JSON request string; a bare
        string is accepted as well.

    Returns:
      A (return code, string) tuple: 0 and the artifact path on success,
      a non-zero code and an error description otherwise.
    """
    # Converts single string input into a one-element list.
    if isinstance(options, str):
        options = [options]
    # An empty argument list would otherwise raise IndexError below.
    if not options:
        return 1, "invalid input: empty request"

    request = options[0]
    try:
        json.loads(request)
    except (TypeError, json.JSONDecodeError):
        return 1, "invalid input: " + str(request)

    # Retry the lookups that fell back to their defaults earlier.
    if not self._board_type:
        self._board_type = self.get_board_type()
    if self._hostname == "switch":
        self._hostname = self.get_hostname()

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S%f")
    try:
        rc, artifact_path = self.collect_artifacts(
            request, timestamp, self._board_type, self._hostname)
    except Exception as error:
        # Report the failure to the DBus caller instead of raising, but keep
        # the traceback in the log.
        logger.exception("artifact collection failed")
        return 1, "Artifact collection failed: " + str(error)
    return rc, artifact_path
297+
298+
@host_service.method(
    host_service.bus_name(MOD_NAME), in_signature="as", out_signature="is")
def check(self, options):
    """Report artifact readiness.

    Artifact collection is synchronous, so the artifact is always ready;
    `options` is not inspected.

    Returns:
      The tuple (0, "Artifact ready").
    """
    ready_status = (0, "Artifact ready")
    return ready_status
303+
304+
@host_service.method(
    host_service.bus_name(MOD_NAME), in_signature="as", out_signature="is")
def ack(self, options):
    """Delete a collected artifact from the host dump directory.

    Args:
      options: list whose first element is the artifact path; a bare string
        is accepted as well. The path may start with ARTIFACT_DIR or
        ARTIFACT_DIR_CONTAINER, or be relative to the dump directory.

    Returns:
      (0, "") when the file was removed, otherwise (1, error description).
    """
    if isinstance(options, str):
        options = [options]
    # An empty argument list would otherwise raise IndexError below.
    if not options:
        return 1, "invalid input: no artifact specified"

    requested = str(options[0])
    # The artifact name in container has a different prefix. Convert it to
    # the host by dropping whichever dump-directory prefix is present.
    relative = requested
    for prefix in (ARTIFACT_DIR_CONTAINER, ARTIFACT_DIR):
        if requested == prefix or requested.startswith(prefix + "/"):
            relative = requested[len(prefix):]
            break
    artifact = os.path.normpath(
        os.path.join(ARTIFACT_DIR, relative.lstrip("/")))

    # Refuse anything that resolves outside the dump directory (for example
    # via ".." components), since the file is about to be deleted.
    root = os.path.realpath(ARTIFACT_DIR)
    parent = os.path.realpath(os.path.dirname(artifact))
    if os.path.commonpath([root, parent]) != root:
        return 1, "Invalid artifact path: " + str(artifact)

    try:
        os.remove(artifact)
    except FileNotFoundError:
        return 1, "Artifact file not found: " + str(artifact)
    except PermissionError:
        return 1, "Artifact file permission denied: " + str(artifact)
    except OSError as error:
        return 1, "Failed to delete artifact file with error: " + str(error)
    return 0, ""
321+
322+
def register():
    """Return the DebugInfo class together with its module name."""
    registration = (DebugInfo, MOD_NAME)
    return registration

scripts/sonic-host-server

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,20 +12,7 @@ import dbus.service
1212
import dbus.mainloop.glib
1313

1414
from gi.repository import GObject
15-
from host_modules import (
16-
config_engine,
17-
gcu,
18-
host_service,
19-
showtech,
20-
systemd_service,
21-
file_service,
22-
image_service,
23-
docker_service,
24-
reboot,
25-
debug_service,
26-
gnoi_reset
27-
)
28-
15+
from host_modules import config_engine, debug_info, debug_service, docker_service, file_service, gcu, gnoi_reset, host_service, image_service, reboot, showtech, systemd_service
2916

3017
def register_dbus():
3118
"""Register DBus handlers for individual modules"""
@@ -40,7 +27,8 @@ def register_dbus():
4027
'docker_service': docker_service.DockerService('docker_service'),
4128
'file_stat': file_service.FileService('file'),
4229
'debug_service': debug_service.DebugExecutor('DebugExecutor'),
43-
'gnoi_reset': gnoi_reset.GnoiReset('gnoi_reset')
30+
'gnoi_reset': gnoi_reset.GnoiReset('gnoi_reset'),
31+
'debug_info': debug_info.DebugInfo('debug_info')
4432
}
4533
for mod_name, handler_class in mod_dict.items():
4634
handlers[mod_name] = handler_class

0 commit comments

Comments
 (0)