|
| 1 | +"""Debug Info. |
| 2 | +
|
| 3 | +This host service module implements the backend support for |
| 4 | +collecting host debug artifacts. Depending on the log level |
| 5 | +and board type input, the relevant log files, DB snapshots, |
| 6 | +counters, record files, and various command outputs are collected |
| 7 | +and aggregated under a specified artifact directory, and the directory is |
| 8 | +compressed to a *.tar.gz on the host. |
| 9 | +
|
| 10 | +The SONiC-supported common debug commands collect the following files: |
| 11 | +core, log, db, counter files, routing.txt and version.txt. |
| 12 | +""" |
| 13 | + |
| 14 | +from datetime import datetime |
| 15 | +import json |
| 16 | +import logging |
| 17 | +import os |
| 18 | +import shutil |
| 19 | +import subprocess |
| 20 | +import time |
| 21 | + |
| 22 | +from host_modules import host_service |
| 23 | +# Import SONiC debug commands for SONiC platform. |
| 24 | +from utils.sonic_debug_cmds import * |
| 25 | + |
# Name under which this host module is exposed.
MOD_NAME = "debug_info"

# Volatile staging area: the per-request directory and the resulting
# *.tar.gz are both created directly under ARTIFACT_DIR.
ARTIFACT_DIR = "/tmp/dump"
ARTIFACT_DIR_CONTAINER = "/var/dump"
# Sub-directory names used inside a per-request artifact directory.
ARTIFACT_DIR_HOST = "host"
CORE_DIR = "core"
DB_ARTIFACT_DIR = f"{ARTIFACT_DIR_HOST}/db"

# Persistent (non-volatile) copy of the archive.  The archive is copied only
# when the free space on NONVOLATILE_PARTITION is at least
# NONVOLATILE_STORAGE_REQUIRED bytes plus the archive size, and only while the
# NONVOLATILE_TMP_FLAG file does not exist yet.
NONVOLATILE_PARTITION = "/var/log/"
NONVOLATILE_ARTIFACT_DIR = "/var/log/dump"
NONVOLATILE_STORAGE_REQUIRED = 500_000_000
NONVOLATILE_TMP_FLAG = "/tmp/nonvolatile_saved"

# Accepted values of the "level" field of a collection request.
ARTIFACT_LEVEL_ALERT = "alert"
ARTIFACT_LEVEL_CRITICAL = "critical"
ARTIFACT_LEVEL_ALL = "all"

# Keys looked up in the JSON collection request.
LOG_LEVEL_KEY = "level"
PERSISTENT_STORAGE_KEY = "use_persistent_storage"

STATE_DB_SEPARATOR = "|"
DEBUG_INFO_FLAG = "debug_info"

# Module log file; the directory is created on import if it is missing.
log_dir = "/var/log"
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, "debug_info.log")

logging.basicConfig(
    filename=log_file,
    filemode="a",  # append so earlier records survive a restart
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.DEBUG,
)

logger = logging.getLogger(__name__)
| 57 | + |
class DebugInfo(host_service.HostModule):
    """DBus endpoint that collects debug artifacts.

    ``collect`` gathers the artifacts into a ``*.tar.gz`` under
    ``ARTIFACT_DIR``, ``check`` reports readiness and ``ack`` deletes an
    archive once the caller is done with it.
    """

    def __init__(self, mod_name):
        """Cache the board type and hostname, then initialise the base module.

        Args:
          mod_name: module name forwarded to ``host_service.HostModule``.
        """
        self._board_type = DebugInfo.get_board_type()
        self._hostname = DebugInfo.get_hostname()
        super().__init__(mod_name)

    @staticmethod
    def _run_command(cmd: str, timeout: int = 20):
        """Run ``cmd`` through the shell and capture its output.

        Args:
          cmd: shell command line to execute.
          timeout: seconds to wait before the command is killed.

        Returns:
          A ``(returncode, stdout, stderr)`` tuple. On timeout the tuple is
          ``(1, "command timeout", "command timeout")``.
        """
        # The context manager closes both pipes and waits for the child on
        # every exit path, so a killed child is reaped instead of lingering as
        # a zombie with open file descriptors.
        with subprocess.Popen(
                cmd,
                shell=True,
                text=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                close_fds=True) as proc:
            try:
                stdout, stderr = proc.communicate(timeout=timeout)
            except subprocess.TimeoutExpired:
                proc.kill()
                return 1, "command timeout", "command timeout"
            return proc.returncode, stdout, stderr

    @staticmethod
    def get_board_type() -> str:
        """Return the stripped output of ``BOARD_TYPE_CMD``, or "" on failure."""
        rc, stdout, err = DebugInfo._run_command(BOARD_TYPE_CMD)
        if rc != 0:
            logger.warning("fail to execute command '%s': %s", BOARD_TYPE_CMD, err)
            return ""
        return stdout.strip()

    @staticmethod
    def get_hostname() -> str:
        """Return the output of ``hostname``, or "switch" if it is unavailable."""
        cmd = "hostname"
        rc, stdout, err = DebugInfo._run_command(cmd)
        if rc != 0:
            logger.warning("fail to execute command '%s': %s", cmd, err)
            return "switch"
        # An empty name would yield artifact names starting with "_"; fall
        # back to the same default that is used when the command fails.
        return stdout.strip() or "switch"

    @staticmethod
    def _collect_counter_artifacts(directory: str, prefix: str,
                                   board_type: str) -> None:
        """Run every ``COUNTER_CMDS`` entry into a fresh timestamped directory.

        Args:
          directory: parent directory for the counter snapshot directory.
          prefix: text prepended to the snapshot directory name
            (the callers in this class pass "pre_" and "post_").
          board_type: not used by this method.
        """
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        counter_artifact_dir = os.path.join(directory,
                                            f"{prefix}counter_{stamp}")
        os.makedirs(counter_artifact_dir, exist_ok=True)

        for cmd in COUNTER_CMDS:
            rc, _, err = DebugInfo._run_command(
                cmd.format(counter_artifact_dir), timeout=60)
            if rc != 0:
                # Continue the artifact collection in case of error.
                logger.warning("fail to execute command '%s': %s", cmd, err)

    @staticmethod
    def _collect_teamdctl_data(artifact_dir_host):
        """Save the output of ``TEAMD_CTL_CMD`` for every listed trunk.

        Trunk names are taken from the second ``|``-separated field of each
        line printed by ``REDIS_LIST_PORTCHANNEL_CMD``. The output for each
        trunk is written to ``teamdctl_<trunk>.txt`` in ``artifact_dir_host``.

        Args:
          artifact_dir_host: directory that receives the output files.
        """
        # Both commands go through _run_command so that they are bounded by a
        # timeout like every other command run by this module.
        rc, stdout, err = DebugInfo._run_command(REDIS_LIST_PORTCHANNEL_CMD)
        if rc != 0:
            logger.warning("Error running Redis command: %s", err)
            return

        for trunk in stdout.strip().split('\n'):
            try:
                trk = trunk.split('|')[1]
            except IndexError:
                # No trunk is found in the DB or the trunk table format is
                # incorrect.
                continue

            rc, trunk_output, err = DebugInfo._run_command(
                TEAMD_CTL_CMD.format(trk))
            if rc != 0:
                logger.warning("Error running teamdctl for %s: %s", trk, err)
                continue

            filepath = os.path.join(artifact_dir_host, f'teamdctl_{trk}.txt')
            try:
                with open(filepath, 'w') as f:
                    f.write(trunk_output)
            except OSError as error:
                # The output directory is unusable; stop instead of failing
                # the same way for every remaining trunk.
                logger.warning("cannot write %s: %s", filepath, error)
                return

    @staticmethod
    def _save_persistent_storage(artifact_name: str) -> None:
        """Copy an archive from ``ARTIFACT_DIR`` to ``NONVOLATILE_ARTIFACT_DIR``.

        This is best-effort: every failure is logged and swallowed. The copy
        is skipped when ``NONVOLATILE_TMP_FLAG`` already exists; otherwise the
        flag is created first, so at most one attempt is made while the flag
        file is present.

        Args:
          artifact_name: file name of the archive inside ``ARTIFACT_DIR``.
        """
        if os.path.isfile(NONVOLATILE_TMP_FLAG):
            logger.warning(
                "%s already exists, skipping saving artifacts to "
                "persistent storage", NONVOLATILE_TMP_FLAG)
            return
        try:
            with open(NONVOLATILE_TMP_FLAG, "w+"):
                pass
        except OSError as e:
            logger.warning("error creating flag in tmp: %s. Error: %s",
                           NONVOLATILE_TMP_FLAG, str(e))
            return

        host_artifact_name = os.path.join(ARTIFACT_DIR, artifact_name)
        # Drop any previously saved artifacts before checking the free space.
        shutil.rmtree(NONVOLATILE_ARTIFACT_DIR, ignore_errors=True)
        try:
            artifact_size = os.path.getsize(host_artifact_name)
        except OSError:
            logger.warning("path %s did not exist", host_artifact_name)
            return

        try:
            free = shutil.disk_usage(NONVOLATILE_PARTITION).free
            required = NONVOLATILE_STORAGE_REQUIRED + artifact_size
            if free < required:
                logger.warning(
                    "free space remaining on %s is less than %d: %d. Not saving "
                    "artifacts to persistent storage", NONVOLATILE_PARTITION,
                    required, free)
                return

            os.makedirs(NONVOLATILE_ARTIFACT_DIR, exist_ok=True)
            shutil.copy(host_artifact_name,
                        os.path.join(NONVOLATILE_ARTIFACT_DIR, artifact_name))
        except OSError as error:
            # Report success overall if saving to persistent storage fails,
            # saving to persistent storage is best-effort.
            logger.warning("fail to save %s to persistent storage: %s",
                           host_artifact_name, error)

    @staticmethod
    def collect_artifacts(req: str, timestamp: str, board_type: str,
                          hostname: str):
        """Collect all artifacts for a given board type.

        Currently only host-level and DB artifacts are collected.
        Component-level (e.g., gnmi/orch) artifact collection is not supported.

        This method can also be called by the CLI.

        Args:
          req: a single JSON object string that contains the log level and,
            optionally, the persistent-storage flag indicating that the
            artifacts should be stored in persistent storage in addition to
            volatile storage.
          timestamp: a timestamp string that is used in the artifact name.
          board_type: a string representation of the board type.
          hostname: the hostname of the device, used to name the output
            directory.

        Returns:
          A ``(rc, message)`` tuple. On success ``rc`` is 0 and ``message`` is
          the path of the archive on the host; otherwise ``rc`` is non-zero
          and ``message`` describes the error.
        """
        try:
            request = json.loads(req)
        except (json.JSONDecodeError, TypeError):
            return 1, "invalid input: " + str(req)
        if not isinstance(request, dict):
            # Valid JSON that is not an object (e.g. "[]") carries no options.
            return 1, "invalid input: " + str(req)

        log_level = request.get(LOG_LEVEL_KEY, ARTIFACT_LEVEL_ALERT)
        use_persistent_storage = request.get(PERSISTENT_STORAGE_KEY, False)
        # Counters and DB snapshots are only gathered for these two levels.
        extended = log_level in (ARTIFACT_LEVEL_CRITICAL, ARTIFACT_LEVEL_ALL)

        dir_name = hostname + "_" + timestamp
        artifact_root = os.path.join(ARTIFACT_DIR, dir_name)
        artifact_dir_host = os.path.join(artifact_root, ARTIFACT_DIR_HOST)
        db_artifact_dir = os.path.join(artifact_root, DB_ARTIFACT_DIR)

        os.makedirs(artifact_dir_host, exist_ok=True)

        # Collect counter artifacts at the beginning of the collection.
        if extended:
            DebugInfo._collect_counter_artifacts(artifact_dir_host, "pre_",
                                                 board_type)

        for cmd in COMMON_CMDS:
            rc, _, err = DebugInfo._run_command(cmd.format(artifact_dir_host))
            if rc != 0:
                # Continue the artifact collection in case of error.
                logger.warning("fail to execute command '%s': %s", cmd, err)

        # Create host/core dir if it does not exist.
        os.makedirs(os.path.join(artifact_dir_host, CORE_DIR), exist_ok=True)

        DebugInfo._collect_teamdctl_data(artifact_dir_host)

        if extended:
            os.makedirs(db_artifact_dir, exist_ok=True)
            for cmd in DB_CMDS:
                rc, _, err = DebugInfo._run_command(
                    cmd.format(db_artifact_dir), timeout=60)
                if rc != 0:
                    # Continue the artifact collection in case of error.
                    logger.warning("fail to execute command '%s': %s", cmd, err)

            # Collect counter artifacts at the end of the collection.
            DebugInfo._collect_counter_artifacts(artifact_dir_host, "post_",
                                                 board_type)

        artifact_name = dir_name + ".tar.gz"
        host_artifact_name = os.path.join(ARTIFACT_DIR, artifact_name)

        cmd = ("tar -C " + ARTIFACT_DIR + " -zcvf " + host_artifact_name + " " +
               dir_name)

        rc, _, err = DebugInfo._run_command(cmd, timeout=60)
        # The staging directory is no longer needed whether or not tar worked.
        shutil.rmtree(artifact_root, ignore_errors=True)
        if rc != 0:
            return rc, "fail to execute command '" + cmd + "': " + err

        if use_persistent_storage:
            DebugInfo._save_persistent_storage(artifact_name)

        return 0, host_artifact_name

    @host_service.method(
        host_service.bus_name(MOD_NAME), in_signature="as", out_signature="is")
    def collect(self, options):
        """DBus entrypoint to collect debug artifacts from host.

        Args:
          options: string array whose first element is the JSON request
            passed to ``collect_artifacts``; a bare string is also accepted.

        Returns:
          A ``(rc, message)`` tuple; see ``collect_artifacts``.
        """
        # Converts single string input into a one-element list.
        if isinstance(options, str):
            options = [options]
        if not options:
            return 1, "invalid input: no options provided"

        request = options[0]
        try:
            json.loads(request)
        except (json.JSONDecodeError, TypeError):
            return 1, "invalid input: " + str(request)

        # Retry the lookups that fell back to their defaults earlier.
        if not self._board_type:
            self._board_type = self.get_board_type()
        if self._hostname == "switch":
            self._hostname = self.get_hostname()

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S%f")
        try:
            rc, artifact_path = self.collect_artifacts(
                request, timestamp, self._board_type, self._hostname)
        except Exception as error:
            # Boundary of the DBus call: report the failure to the caller
            # instead of raising, but keep the traceback in the log.
            logger.exception("Artifact collection failed")
            return 1, "Artifact collection failed: " + str(error)
        return rc, artifact_path

    @host_service.method(
        host_service.bus_name(MOD_NAME), in_signature="as", out_signature="is")
    def check(self, options):
        """Always ready because artifact collection is synchronous."""
        return 0, "Artifact ready"

    @host_service.method(
        host_service.bus_name(MOD_NAME), in_signature="as", out_signature="is")
    def ack(self, options):
        """DBus entrypoint to delete a collected artifact from the host.

        Args:
          options: string array whose first element is the artifact path; a
            bare string is also accepted.

        Returns:
          ``(0, "")`` on success, otherwise ``(1, <error message>)``.
        """
        if isinstance(options, str):
            options = [options]
        if not options:
            return 1, "invalid input: no artifact specified"

        requested = str(options[0])
        # The artifact name in container has a different prefix. Convert it to
        # the host. Paths that already carry the host prefix (as returned by
        # collect) are accepted as well.
        # NOTE(review): the container prefix is assumed to be
        # ARTIFACT_DIR_CONTAINER - confirm against the caller.
        relative = requested
        for prefix in (ARTIFACT_DIR, ARTIFACT_DIR_CONTAINER):
            if requested.startswith(prefix):
                relative = requested[len(prefix):]
                break

        artifact = os.path.normpath(
            os.path.join(ARTIFACT_DIR, relative.lstrip("/")))
        # Refuse anything that resolves outside ARTIFACT_DIR (e.g. through
        # ".." segments) so that only collected artifacts can be deleted.
        if not artifact.startswith(ARTIFACT_DIR + os.sep):
            return 1, "Invalid artifact path: " + requested

        try:
            os.remove(artifact)
        except FileNotFoundError:
            return 1, "Artifact file not found: " + str(artifact)
        except PermissionError:
            return 1, "Artifact file permission denied: " + str(artifact)
        except OSError as error:
            return 1, "Failed to delete artifact file with error: " + str(error)
        return 0, ""
| 321 | + |
def register():
    """Return the ``DebugInfo`` class paired with ``MOD_NAME``.

    Presumably consumed by the host-service loader to register this module;
    confirm against the caller.
    """
    registration = (DebugInfo, MOD_NAME)
    return registration
0 commit comments