Skip to content

Commit 72f8543

Browse files
committed
Additional debugging
Signed-off-by: Julius Koskela <julius.koskela@unikie.com> Signed-off-by: Julius Koskela (Digimuoto Oy) <julius.koskela@digimuoto.com>
1 parent ca55e1f commit 72f8543

File tree

4 files changed

+267
-3
lines changed

4 files changed

+267
-3
lines changed

modules/common/logging/common.nix

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,12 @@ let
7070
7171
systemd-tmpfiles --create --prefix /var/log/journal
7272
systemctl restart systemd-journald.service
73-
systemctl restart alloy.service
73+
74+
if systemctl cat alloy.service >/dev/null 2>&1; then
75+
systemctl restart alloy.service
76+
else
77+
echo "alloy.service not installed, skipping restart"
78+
fi
7479
'';
7580
};
7681
in
@@ -144,7 +149,9 @@ in
144149
};
145150

146151
recovery = {
147-
enable = mkEnableOption "journald/alloy recovery after realtime clock jumps";
152+
enable = (mkEnableOption "journald/alloy recovery after realtime clock jumps") // {
153+
default = true;
154+
};
148155

149156
thresholdSeconds = mkOption {
150157
description = "Only act on clock jumps >= this many seconds.";

modules/common/logging/fss-verify-classifier.sh

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# SPDX-FileCopyrightText: 2022-2026 TII (SSRC) and the Ghaf contributors
33
# SPDX-License-Identifier: Apache-2.0
44
# shellcheck shell=bash
5-
# shellcheck disable=SC2034
5+
# shellcheck disable=SC2034,SC2329
66

77
# Shared helpers for classifying `journalctl --verify` output.
88

@@ -77,6 +77,67 @@ fss_append_line() {
7777
fi
7878
}
7979

80+
# shellcheck disable=SC2329
81+
fss_count_nonempty_lines() {
82+
local text="$1"
83+
local line
84+
local count=0
85+
86+
while IFS= read -r line || [ -n "$line" ]; do
87+
if [ -n "$line" ]; then
88+
count=$((count + 1))
89+
fi
90+
done <<<"$text"
91+
92+
printf '%s' "$count"
93+
}
94+
95+
# shellcheck disable=SC2329
96+
fss_failure_bucket_for_path() {
97+
local failure_path="$1"
98+
99+
case "$failure_path" in
100+
*.journal~)
101+
printf '%s' "temp"
102+
;;
103+
*/system.journal)
104+
printf '%s' "active-system"
105+
;;
106+
*/system@*.journal)
107+
printf '%s' "archived-system"
108+
;;
109+
*/user-[0-9]*.journal)
110+
printf '%s' "user-journal"
111+
;;
112+
*)
113+
printf '%s' "other"
114+
;;
115+
esac
116+
}
117+
118+
# shellcheck disable=SC2329
119+
fss_unique_fail_paths_from_output() {
120+
local output="$1"
121+
local line
122+
local failure_path
123+
local unique_paths=""
124+
125+
while IFS= read -r line || [ -n "$line" ]; do
126+
case "$line" in
127+
FAIL:\ *)
128+
failure_path="${line#FAIL: }"
129+
failure_path="${failure_path%% *}"
130+
131+
if [ -n "$failure_path" ] && ! printf '%s\n' "$unique_paths" | grep -Fxq "$failure_path"; then
132+
unique_paths=$(fss_append_line "$unique_paths" "$failure_path")
133+
fi
134+
;;
135+
esac
136+
done <<<"$output"
137+
138+
printf '%s' "$unique_paths"
139+
}
140+
80141
fss_reset_classification() {
81142
FSS_REASON_TAGS=""
82143
FSS_FAIL_LINES=""

tests/logging/test_scripts/fss-test.nix

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,161 @@ writeShellApplication {
6262
fail() { fss_log_fail "$1"; FAILED=$((FAILED + 1)); }
6363
warn() { fss_log_warn "$1"; WARNED=$((WARNED + 1)); }
6464
info() { fss_log_info "$1"; }
65+
count_matches() {
66+
local pattern="$1"
67+
local text="$2"
68+
69+
printf '%s\n' "$text" | grep -Eic "$pattern" || true
70+
}
71+
print_prefixed_lines() {
72+
local prefix="$1"
73+
local text="$2"
74+
local limit="''${3:-0}"
75+
local line
76+
local total=0
77+
local shown=0
78+
79+
total=$(fss_count_nonempty_lines "$text")
80+
if [ "$total" -eq 0 ]; then
81+
return 0
82+
fi
83+
84+
while IFS= read -r line || [ -n "$line" ]; do
85+
if [ -z "$line" ]; then
86+
continue
87+
fi
88+
89+
shown=$((shown + 1))
90+
if [ "$limit" -gt 0 ] && [ "$shown" -gt "$limit" ]; then
91+
break
92+
fi
93+
94+
printf '%s%s\n' "$prefix" "$line"
95+
done <<<"$text"
96+
97+
if [ "$limit" -gt 0 ] && [ "$total" -gt "$limit" ]; then
98+
printf '%s... (%s more lines)\n' "$prefix" "$((total - limit))"
99+
fi
100+
}
101+
summarize_header_output() {
102+
local header_output="$1"
103+
104+
printf '%s\n' "$header_output" | grep -E '^(State:|Compatible flags:|Incompatible flags:|Boot ID:|Machine ID:|Head realtime timestamp:|Tail realtime timestamp:|Tail monotonic timestamp:|Rotate suggested:|Disk usage:)' || true
105+
}
106+
summarize_file_verify_output() {
107+
local verify_output="$1"
108+
109+
printf '%s\n' "$verify_output" | grep -E '(^[[:xdigit:]]+: )|(^File corruption detected)|(^FAIL: )|(^PASS: )|(^=> Validated)|(^Journal file .* is truncated, ignoring file\.)|(Required key not available)|(parse.*seed)|(tag/entry realtime timestamp out of synchronization)' || true
110+
}
111+
print_recent_journald_alerts() {
112+
local alerts
113+
114+
alerts=$(journalctl -u systemd-journald --no-pager 2>/dev/null | grep -Ei 'corrupt|unclean|renaming and replacing|failed to append tag when closing journal|time jumped backwards|rotating|recover' | tail -n 12 || true)
115+
if [ -n "$alerts" ]; then
116+
echo " Recent systemd-journald alerts:"
117+
print_prefixed_lines " " "$alerts"
118+
fi
119+
}
120+
print_failed_file_diagnostics() {
121+
local verify_output="$1"
122+
local verify_key="$2"
123+
local failed_paths
124+
local failed_count=0
125+
local path
126+
local bucket
127+
local stat_output
128+
local header_output
129+
local header_summary
130+
local file_verify_output
131+
local file_verify_summary
132+
local path_limit=4
133+
local total_failed_paths
134+
135+
failed_paths=$(fss_unique_fail_paths_from_output "$verify_output")
136+
if [ -z "$failed_paths" ]; then
137+
return 0
138+
fi
139+
total_failed_paths=$(fss_count_nonempty_lines "$failed_paths")
140+
141+
echo " Detailed file diagnostics (up to $path_limit unique failing files):"
142+
while IFS= read -r path || [ -n "$path" ]; do
143+
if [ -z "$path" ]; then
144+
continue
145+
fi
146+
147+
failed_count=$((failed_count + 1))
148+
if [ "$failed_count" -gt "$path_limit" ]; then
149+
printf ' ... %s additional failing file(s) omitted\n' "$((total_failed_paths - path_limit))"
150+
break
151+
fi
152+
153+
bucket=$(fss_failure_bucket_for_path "$path")
154+
printf ' - %s [%s]\n' "$path" "$bucket"
155+
156+
if [ ! -e "$path" ]; then
157+
echo " file no longer exists"
158+
continue
159+
fi
160+
161+
stat_output=$(stat -c 'size=%s mode=%a uid=%u gid=%g mtime=%y' "$path" 2>/dev/null || true)
162+
if [ -n "$stat_output" ]; then
163+
echo " stat: $stat_output"
164+
fi
165+
166+
header_output=$(journalctl --header --file="$path" 2>/dev/null || true)
167+
header_summary=$(summarize_header_output "$header_output")
168+
if [ -n "$header_summary" ]; then
169+
print_prefixed_lines " header: " "$header_summary" 8
170+
fi
171+
172+
if [ -n "$verify_key" ]; then
173+
file_verify_output=$(journalctl --verify --verify-key="$verify_key" --file="$path" 2>&1 || true)
174+
file_verify_summary=$(summarize_file_verify_output "$file_verify_output")
175+
if [ -n "$file_verify_summary" ]; then
176+
print_prefixed_lines " verify: " "$file_verify_summary" 8
177+
fi
178+
fi
179+
done <<<"$failed_paths"
180+
}
181+
print_verify_diagnostics() {
182+
local verify_output="$1"
183+
local verify_tags="$2"
184+
local verify_exit="$3"
185+
local verify_key="''${4:-}"
186+
local active_count
187+
local archived_count
188+
local user_count
189+
local temp_count
190+
local other_count
191+
local tag_failed_count
192+
local epoch_count
193+
local time_sync_count
194+
local corruption_count
195+
local io_error_count
196+
local fail_path_count
197+
198+
active_count=$(fss_count_nonempty_lines "$FSS_ACTIVE_SYSTEM_FAILURES")
199+
archived_count=$(fss_count_nonempty_lines "$FSS_ARCHIVED_SYSTEM_FAILURES")
200+
user_count=$(fss_count_nonempty_lines "$FSS_USER_FAILURES")
201+
temp_count=$(fss_count_nonempty_lines "$FSS_TEMP_FAILURES")
202+
other_count=$(fss_count_nonempty_lines "$FSS_OTHER_FAILURES")
203+
fail_path_count=$(fss_count_nonempty_lines "$(fss_unique_fail_paths_from_output "$verify_output")")
204+
205+
tag_failed_count=$(count_matches 'Tag failed verification' "$verify_output")
206+
epoch_count=$(count_matches 'Epoch sequence not continuous' "$verify_output")
207+
time_sync_count=$(count_matches 'tag/entry realtime timestamp out of synchronization' "$verify_output")
208+
corruption_count=$(count_matches 'File corruption detected at ' "$verify_output")
209+
io_error_count=$(count_matches 'Input/output error|I/O error' "$verify_output")
210+
211+
printf ' Diagnostic summary: exit=%s tags=[%s] failing_files=%s active=%s archived=%s user=%s temp=%s other=%s\n' \
212+
"$verify_exit" "$verify_tags" "$fail_path_count" "$active_count" "$archived_count" "$user_count" "$temp_count" "$other_count"
213+
printf ' Signals: tag_failed=%s epoch_discontinuity=%s time_sync=%s file_corruption=%s io_error=%s key_parse=%s key_missing=%s filesystem_restriction=%s\n' \
214+
"$tag_failed_count" "$epoch_count" "$time_sync_count" "$corruption_count" "$io_error_count" \
215+
"$FSS_KEY_PARSE_ERROR" "$FSS_KEY_REQUIRED_ERROR" "$FSS_FILESYSTEM_RESTRICTION"
216+
217+
print_failed_file_diagnostics "$verify_output" "$verify_key"
218+
print_recent_journald_alerts
219+
}
65220
66221
fss_log_block <<'EOF'
67222
==========================================
@@ -209,26 +364,33 @@ writeShellApplication {
209364
if [ "$FSS_KEY_PARSE_ERROR" -eq 1 ] || [ "$FSS_KEY_REQUIRED_ERROR" -eq 1 ]; then
210365
fail "Journal verification failed due to verification key defect [$VERIFY_TAGS]"
211366
echo " Output: $VERIFY_OUTPUT"
367+
print_verify_diagnostics "$VERIFY_OUTPUT" "$VERIFY_TAGS" "$VERIFY_EXIT"
212368
elif [ -n "$FSS_ACTIVE_SYSTEM_FAILURES" ]; then
213369
fail "Active system journal verification failed [$VERIFY_TAGS]"
214370
echo " Output: $VERIFY_OUTPUT"
371+
print_verify_diagnostics "$VERIFY_OUTPUT" "$VERIFY_TAGS" "$VERIFY_EXIT" "$VERIFY_KEY"
215372
elif [ -n "$FSS_OTHER_FAILURES" ]; then
216373
fail "Journal verification found unclassified critical failures [$VERIFY_TAGS]"
217374
echo " Output: $VERIFY_OUTPUT"
375+
print_verify_diagnostics "$VERIFY_OUTPUT" "$VERIFY_TAGS" "$VERIFY_EXIT" "$VERIFY_KEY"
218376
elif [ -n "$FSS_ARCHIVED_SYSTEM_FAILURES" ] || [ -n "$FSS_USER_FAILURES" ]; then
219377
warn "Journal verification passed with archive/user warnings [$VERIFY_TAGS]"
220378
echo " Output: $VERIFY_OUTPUT"
379+
print_verify_diagnostics "$VERIFY_OUTPUT" "$VERIFY_TAGS" "$VERIFY_EXIT" "$VERIFY_KEY"
221380
elif [ -n "$FSS_TEMP_FAILURES" ]; then
222381
pass "Journal verification passed (temporary journal files ignored)"
223382
echo " Ignored temp failures: $FSS_TEMP_FAILURES"
383+
print_verify_diagnostics "$VERIFY_OUTPUT" "$VERIFY_TAGS" "$VERIFY_EXIT" "$VERIFY_KEY"
224384
elif [ "$VERIFY_EXIT" -eq 0 ]; then
225385
pass "Journal verification passed"
226386
elif [ "$FSS_FILESYSTEM_RESTRICTION" -eq 1 ]; then
227387
warn "Verification encountered filesystem restrictions [$VERIFY_TAGS]"
228388
echo " Output: $VERIFY_OUTPUT"
389+
print_verify_diagnostics "$VERIFY_OUTPUT" "$VERIFY_TAGS" "$VERIFY_EXIT"
229390
else
230391
warn "Verification returned exit code $VERIFY_EXIT without critical failures [$VERIFY_TAGS]"
231392
echo " Output: $VERIFY_OUTPUT"
393+
print_verify_diagnostics "$VERIFY_OUTPUT" "$VERIFY_TAGS" "$VERIFY_EXIT" "$VERIFY_KEY"
232394
fi
233395
fi
234396

tests/logging/test_scripts/fss_verification.nix

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,22 @@ _: ''
126126
[ -n "$FSS_OTHER_FAILURES" ]
127127
[ -z "$FSS_TEMP_FAILURES" ]
128128
129+
mixed_sample=$(printf "%s\n" \
130+
"FAIL: /var/log/journal/mid/system.journal (Bad message)" \
131+
"FAIL: /var/log/journal/mid/system.journal (Bad message)" \
132+
"FAIL: /var/log/journal/mid/system@0000000000000001-0000000000000002.journal (Bad message)" \
133+
"FAIL: /var/log/journal/mid/user-1000@0000000000000001-0000000000000002.journal (Bad message)" \
134+
"FAIL: /var/log/journal/mid/system@0000000000000001-0000000000000002.journal~ (Bad message)" \
135+
"FAIL: /var/log/journal/mid/custom.journal (Bad message)")
136+
unique_fail_paths=$(fss_unique_fail_paths_from_output "$mixed_sample")
137+
[ "$(fss_count_nonempty_lines "$unique_fail_paths")" -eq 5 ]
138+
[ "$(printf "%s\n" "$unique_fail_paths" | grep -c '^/var/log/journal/mid/system.journal$')" -eq 1 ]
139+
[ "$(fss_failure_bucket_for_path "/var/log/journal/mid/system.journal")" = "active-system" ]
140+
[ "$(fss_failure_bucket_for_path "/var/log/journal/mid/system@0000000000000001-0000000000000002.journal")" = "archived-system" ]
141+
[ "$(fss_failure_bucket_for_path "/var/log/journal/mid/user-1000@0000000000000001-0000000000000002.journal")" = "user-journal" ]
142+
[ "$(fss_failure_bucket_for_path "/var/log/journal/mid/system@0000000000000001-0000000000000002.journal~")" = "temp" ]
143+
[ "$(fss_failure_bucket_for_path "/var/log/journal/mid/custom.journal")" = "other" ]
144+
129145
key_sample=$(printf "%s\n" \
130146
"Failed to parse seed." \
131147
"FAIL: /var/log/journal/mid/system.journal (Required key not available)")
@@ -167,6 +183,24 @@ _: ''
167183
'
168184
""")
169185
186+
with subtest("Clock-jump recovery defaults are enabled"):
187+
machine.succeed("systemctl list-unit-files ghaf-clock-jump-watcher.service")
188+
machine.succeed("systemctl list-unit-files ghaf-journal-alloy-recover.service")
189+
machine.wait_for_unit("ghaf-clock-jump-watcher.service")
190+
watcher_status = machine.succeed("systemctl show ghaf-clock-jump-watcher.service --property=ActiveState,UnitFileState")
191+
if "ActiveState=active" not in watcher_status or "UnitFileState=enabled" not in watcher_status:
192+
raise Exception(f"Clock-jump watcher not enabled by default: {watcher_status}")
193+
print(f"Clock-jump recovery watcher status: {watcher_status}")
194+
195+
with subtest("Clock-jump recovery tolerates missing alloy service"):
196+
exit_code, output = machine.execute("systemctl start ghaf-journal-alloy-recover.service 2>&1")
197+
if exit_code != 0:
198+
raise Exception(f"Clock-jump recovery service failed without alloy: {output}")
199+
recover_status = machine.succeed("systemctl show ghaf-journal-alloy-recover.service --property=Result,ExecMainStatus")
200+
if "Result=success" not in recover_status:
201+
raise Exception(f"Clock-jump recovery service did not complete successfully: {recover_status}")
202+
print(f"Clock-jump recovery service completed successfully: {recover_status}")
203+
170204
with subtest("Journal files are created"):
171205
mid = machine.succeed("cat /etc/machine-id").strip()
172206
exit_code, journal_files = machine.execute(f"ls /var/log/journal/{mid}/*.journal 2>/dev/null || ls /run/log/journal/{mid}/*.journal 2>/dev/null")

0 commit comments

Comments
 (0)