Skip to content

Commit 4a96aec

Browse files
committed
workflows: crash: skip false positives from pre-boot crashes
The crash watchdog was reporting old crashes that occurred before the host's last reboot as current issues. Add logic to check whether crash files are older than the host's boot time and skip them if they are. Also automatically clean up these stale crash files. This prevents the watchdog from falsely reporting hosts as crashed when they have already recovered from previous incidents.

Generated-by: Claude AI
Signed-off-by: Luis Chamberlain <[email protected]>
1 parent 141d111 commit 4a96aec

File tree

1 file changed

+54
-0
lines changed

1 file changed

+54
-0
lines changed

scripts/workflows/lib/crash.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,29 @@ def get_qr_ascii(self, content, invert=True):
345345

346346
return buffer.getvalue()
347347

348+
def clean_old_crash_files(self, host_boot_time):
    """Remove crash files that are older than the host's last boot time.

    Looks in ``self.output_dir`` for files matching the ``journal-*``
    crash, corruption, warning and decoded patterns listed below and
    deletes every regular file whose modification time is strictly older
    than ``host_boot_time``.

    Args:
        host_boot_time: Boot time to compare file mtimes against, as
            seconds since the epoch. A falsy value (``None`` or ``0``)
            disables the cleanup entirely.

    Returns:
        None. Each removal is logged at info level. Failures are logged
        as warnings and never raised, because the cleanup is best-effort.
    """
    # NOTE(review): file mtimes are read from the local filesystem; if
    # host_boot_time comes from a different machine this presumably relies
    # on both clocks being in sync -- confirm against the caller.
    if not host_boot_time or not os.path.isdir(self.output_dir):
        return

    file_patterns = (
        "journal-*.crash",
        "journal-*.corruption",
        "journal-*.crash_and_corruption",
        "journal-*.warning",
        "journal-*.decoded.*",
    )

    # A single file can match several patterns (for example
    # "journal-x.decoded.crash"), so gather the candidates into a set to
    # make sure each path is examined, and reported on, exactly once.
    candidates = set()
    for pattern in file_patterns:
        candidates.update(glob.glob(os.path.join(self.output_dir, pattern)))

    # Sorted so that removals and log lines come out in a stable order.
    for file_path in sorted(candidates):
        try:
            # Only regular files are crash artifacts; a directory that
            # happens to match a pattern is left alone.
            if not os.path.isfile(file_path):
                continue
            if os.path.getmtime(file_path) >= host_boot_time:
                continue
            os.remove(file_path)
            logger.info(f"Removed old crash file: {file_path}")
        except FileNotFoundError:
            # The file vanished between listing and removal, so there is
            # nothing left to clean up and nothing worth warning about.
            continue
        except OSError as e:
            logger.warning(f"Failed to remove old crash file {file_path}: {e}")
370+
348371
def load_known_crashes(self):
349372
"""Load previously detected log hashes from the output directory."""
350373
if not os.path.exists(self.output_dir):
@@ -869,6 +892,37 @@ def check_and_reset_host(self, method="auto", get_fstests_log=None):
869892
warnings_file = None
870893
journal_logs = None
871894

895+
# Check if host is up and get its boot time to filter old crashes
896+
host_boot_time = None
897+
if self.check_host_reachable():
898+
try:
899+
result = subprocess.run(
900+
["ssh", self.host_name, "awk '/^btime/ {print $2}' /proc/stat"],
901+
capture_output=True,
902+
text=True,
903+
timeout=10,
904+
)
905+
if result.returncode == 0 and result.stdout.strip():
906+
host_boot_time = int(result.stdout.strip())
907+
logger.debug(f"Host {self.host_name} boot time: {host_boot_time}")
908+
909+
# Check if latest crash file is older than boot time
910+
if self.latest_file_with_issue and os.path.exists(
911+
self.latest_file_with_issue
912+
):
913+
crash_mtime = os.path.getmtime(self.latest_file_with_issue)
914+
if crash_mtime < host_boot_time:
915+
logger.info(
916+
f"Latest crash file for {self.host_name} is from before last boot "
917+
f"(crash: {datetime.fromtimestamp(crash_mtime)}, "
918+
f"boot: {datetime.fromtimestamp(host_boot_time)}), skipping"
919+
)
920+
# Clean up old crash files from before the boot
921+
self.clean_old_crash_files(host_boot_time)
922+
return None, None
923+
except Exception as e:
924+
logger.debug(f"Could not get boot time for {self.host_name}: {e}")
925+
872926
# 1. Try console log first if guestfs is enabled
873927
if method == "console" or (method == "auto" and self.kdevops_enable_guestfs):
874928
logger.debug(f"Trying console.log fallback for {self.host_name}")

0 commit comments

Comments
 (0)