Skip to content

Commit fd4e0e1

Browse files
authored
Merge pull request #253 from itsDNNS/feat/60-modem-restart-detection
feat: modem restart detection via error counter reset (#60)
2 parents 76d0b01 + 13011f2 commit fd4e0e1

File tree

7 files changed

+600
-2
lines changed

7 files changed

+600
-2
lines changed

app/event_detector.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,11 @@
1111
POWER_SHIFT_THRESHOLD = 2.0 # dBmV shift to trigger power_change
1212
UNCORR_SPIKE_THRESHOLD = 1000
1313

14+
# Restart detection thresholds
15+
RESTART_CHANNEL_THRESHOLD = 0.8 # 80% of valid channels must be declining
16+
RESTART_MIN_OVERLAP = 4 # Minimum overlapping channels for fair comparison
17+
RESTART_MIN_CONTINUITY = 0.5 # Minimum overlap ratio (vs either snapshot)
18+
1419
# Import SNR thresholds from analyzer (loaded from thresholds.json)
1520
from app.analyzer import _get_snr_thresholds as _snr_thresholds
1621

@@ -67,6 +72,8 @@ def check(self, analysis):
6772
self._check_channels(events, ts, cur_s, prev_s)
6873
# Modulation change
6974
self._check_modulation(events, ts, analysis, prev)
75+
# Restart detection (before errors — restart causes negative delta)
76+
self._check_restart(events, ts, analysis, prev)
7077
# Error spike
7178
self._check_errors(events, ts, cur_s, prev_s)
7279

@@ -272,3 +279,86 @@ def _check_errors(self, events, ts, cur, prev):
272279
"message": f"Uncorrectable errors jumped by {delta:,} (from {uncorr_prev:,} to {uncorr_cur:,})",
273280
"details": {"prev": uncorr_prev, "current": uncorr_cur, "delta": delta},
274281
})
282+
283+
def _check_restart(self, events, ts, cur, prev):
    """Append a ``modem_restart_detected`` event when counters look reset.

    Downstream channels of both snapshots are matched by ``channel_id`` and
    their ``correctable_errors`` / ``uncorrectable_errors`` values compared.
    The method returns without emitting anything unless:

    * at least ``RESTART_MIN_OVERLAP`` channel ids occur in both snapshots
      and they cover at least ``RESTART_MIN_CONTINUITY`` of each snapshot;
    * at least ``RESTART_MIN_OVERLAP`` shared channels have one comparable
      counter pair, and a ``RESTART_CHANNEL_THRESHOLD`` share of those show
      a drop in some counter while no counter rose;
    * the summary totals, when all four are present, are not both
      greater than or equal to their previous values.

    Args:
        events: List that receives the event dict (mutated in place).
        ts: Value stored under the event's ``"timestamp"`` key.
        cur: Current analysis dict; ``ds_channels`` and ``summary`` are read.
        prev: Previous analysis dict, read the same way as ``cur``.
    """
    before = {ch["channel_id"]: ch for ch in prev.get("ds_channels", [])}
    after = {ch["channel_id"]: ch for ch in cur.get("ds_channels", [])}
    shared_ids = set(before).intersection(after)
    shared_count = len(shared_ids)

    # Continuity guards: too few shared channels, or a channel map that
    # changed too much on either side, makes the comparison unreliable.
    if shared_count < RESTART_MIN_OVERLAP:
        return
    for snapshot_size in (len(before), len(after)):
        if snapshot_size > 0 and shared_count / snapshot_size < RESTART_MIN_CONTINUITY:
            return

    comparable = 0
    reset_like = 0
    for ch_id in shared_ids:
        old_ch = before[ch_id]
        new_ch = after[ch_id]

        # Each counter family is judged on its own; a pair is usable only
        # when both the old and the new value are present.
        pairs = []
        for key in ("correctable_errors", "uncorrectable_errors"):
            old_val = old_ch.get(key)
            new_val = new_ch.get(key)
            if old_val is not None and new_val is not None:
                pairs.append((old_val, new_val))

        if not pairs:
            continue  # nothing comparable on this channel

        comparable += 1
        # Lists (not generators) so every comparison is evaluated.
        dropped = [new_val < old_val for old_val, new_val in pairs]
        not_risen = [new_val <= old_val for old_val, new_val in pairs]
        if any(dropped) and all(not_risen):
            reset_like += 1

    if comparable < RESTART_MIN_OVERLAP:
        return
    if reset_like / comparable < RESTART_CHANNEL_THRESHOLD:
        return

    # Sanity check against the summary totals. Missing keys stay None
    # (no default of 0) so an absent summary cannot fake a decline; in
    # that case the per-channel signal alone decides.
    old_summary = prev.get("summary", {})
    new_summary = cur.get("summary", {})
    prev_corr_total = old_summary.get("ds_correctable_errors")
    prev_uncorr_total = old_summary.get("ds_uncorrectable_errors")
    cur_corr_total = new_summary.get("ds_correctable_errors")
    cur_uncorr_total = new_summary.get("ds_uncorrectable_errors")

    totals = (prev_corr_total, prev_uncorr_total, cur_corr_total, cur_uncorr_total)
    if not any(value is None for value in totals):
        corr_not_lower = cur_corr_total >= prev_corr_total
        if corr_not_lower and cur_uncorr_total >= prev_uncorr_total:
            return  # neither total went down

    details = {
        "affected_channels": reset_like,
        "total_channels": comparable,
        "prev_corr_total": prev_corr_total,
        "prev_uncorr_total": prev_uncorr_total,
        "current_corr_total": cur_corr_total,
        "current_uncorr_total": cur_uncorr_total,
    }
    events.append({
        "timestamp": ts,
        "severity": "info",
        "event_type": "modem_restart_detected",
        "message": "Detected modem restart or counter reset pattern",
        "details": details,
    })

app/i18n/de.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
"settings": "Einstellungen",
2020
"nav_customize": "Navigation anpassen",
2121
"nav_customize_desc": "Ordne Ansichten pro Bereich neu an und hefte die wichtigsten an die mobile Leiste an.",
22-
"nav_reset": "Zurucksetzen",
22+
"nav_reset": "Zurücksetzen",
2323
"nav_done": "Fertig",
2424
"nav_more": "Mehr",
2525
"nav_pinned": "An mobile Leiste angeheftet",
@@ -314,6 +314,7 @@
314314
"event_type_snr_change": "SNR-Änderung",
315315
"event_type_channel_change": "Kanaländerung",
316316
"event_type_modulation_change": "Modulationsänderung",
317+
"event_type_modem_restart_detected": "Modem-Neustart erkannt",
317318
"event_type_error_spike": "Fehleranstieg",
318319
"event_acknowledged": "Bestätigt",
319320
"event_all_severities": "Alle Schweregrade",
@@ -866,4 +867,4 @@
866867
"extensions_verified": "Verifiziert",
867868
"extensions_no_modules": "Keine Community-Module verfügbar.",
868869
"extensions_fetch_failed": "Registry konnte nicht geladen werden."
869-
}
870+
}

app/i18n/en.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,7 @@
314314
"event_type_snr_change": "SNR Change",
315315
"event_type_channel_change": "Channel Change",
316316
"event_type_modulation_change": "Modulation Change",
317+
"event_type_modem_restart_detected": "Modem restart detected",
317318
"event_type_error_spike": "Error Spike",
318319
"event_acknowledged": "Acknowledged",
319320
"event_all_severities": "All Severities",

app/i18n/es.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,7 @@
309309
"event_type_snr_change": "Cambio de SNR",
310310
"event_type_channel_change": "Cambio de canal",
311311
"event_type_modulation_change": "Cambio de modulacion",
312+
"event_type_modem_restart_detected": "Reinicio del módem detectado",
312313
"event_type_error_spike": "Pico de errores",
313314
"event_acknowledged": "Confirmado",
314315
"event_all_severities": "Todas las severidades",

app/i18n/fr.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,7 @@
306306
"event_type_snr_change": "Changement de SNR",
307307
"event_type_channel_change": "Changement de canal",
308308
"event_type_modulation_change": "Changement de modulation",
309+
"event_type_modem_restart_detected": "Redémarrage du modem détecté",
309310
"event_type_error_spike": "Pic d'erreurs",
310311
"event_acknowledged": "Confirme",
311312
"event_all_severities": "Toutes les gravites",
Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
# Modem Restart Detection - Design Spec
2+
3+
**Date:** 2026-03-17
4+
**Issue:** #60
5+
**Status:** Draft
6+
7+
## Summary
8+
9+
Detect modem restarts by observing per-channel DOCSIS error counter resets between consecutive polling cycles. When a modem reboots, its cumulative error counters (correctable and uncorrectable) reset to zero. By comparing per-channel counters across snapshots, DOCSight can emit a `modem_restart_detected` event with high confidence.
10+
11+
## Goals
12+
13+
- Detect silent modem restarts that users would otherwise miss
14+
- Emit an event that appears in the Event Log and Correlation Timeline
15+
- Avoid false positives from channel map changes, parser anomalies, or first-poll scenarios
16+
17+
## Non-Goals
18+
19+
- Dashboard restart marker or dedicated UI surface (event in log is sufficient for MVP)
20+
- Device info refresh after restart (separate scope, requires collector I/O changes)
21+
- Aggressive polling protection or crash-loop detection (different problem class)
22+
- Connection Monitor correlation as a detection condition (useful for timeline context, not as primary signal)
23+
24+
## Architecture
25+
26+
### Location
27+
28+
New method `_check_restart()` in `EventDetector` (`app/event_detector.py`), called from `check()` before `_check_errors()`.
29+
30+
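Follows the same general approach as `_check_errors()` (lines 262-274): compare previous vs. current data and append an event dict if a threshold is met. Unlike `_check_errors()`, which is passed only the summaries, it receives the full analysis dicts (like `_check_modulation()`) because it needs per-channel data.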
31+
32+
### Detection Algorithm
33+
34+
```python
35+
def _check_restart(self, events, ts, cur, prev):
36+
"""Detect modem restart via per-channel error counter reset.
37+
38+
Follows _check_modulation pattern: receives full analysis dicts,
39+
mutates events list in-place, returns None.
40+
"""
41+
# 1. Build channel lookup by channel_id for both snapshots
42+
prev_channels = {ch["channel_id"]: ch for ch in prev.get("ds_channels", [])}
43+
cur_channels = {ch["channel_id"]: ch for ch in cur.get("ds_channels", [])}
44+
45+
# 2. Find overlapping channels (present in both snapshots)
46+
overlap_ids = set(prev_channels.keys()) & set(cur_channels.keys())
47+
48+
# 3. Guard: insufficient continuity
49+
prev_count = len(prev_channels)
50+
cur_count = len(cur_channels)
51+
if len(overlap_ids) < RESTART_MIN_OVERLAP:
52+
return
53+
if prev_count > 0 and len(overlap_ids) / prev_count < RESTART_MIN_CONTINUITY:
54+
return
55+
if cur_count > 0 and len(overlap_ids) / cur_count < RESTART_MIN_CONTINUITY:
56+
return
57+
58+
# 4. Count channels with declining counters
59+
# A channel is "declining" if at least one counter type decreased
60+
# and neither counter increased. This correctly handles the common
61+
# case where uncorrectable_errors was already 0 before restart:
62+
# (N, 0) → (0, 0) counts as declining because correctable declined
63+
# and uncorrectable stayed the same.
64+
valid_channels = 0
65+
declining_channels = 0
66+
67+
for ch_id in overlap_ids:
68+
p = prev_channels[ch_id]
69+
c = cur_channels[ch_id]
70+
p_corr = p.get("correctable_errors")
71+
p_uncorr = p.get("uncorrectable_errors")
72+
c_corr = c.get("correctable_errors")
73+
c_uncorr = c.get("uncorrectable_errors")
74+
75+
# Skip if any counter is None (invalid data)
76+
if any(v is None for v in (p_corr, p_uncorr, c_corr, c_uncorr)):
77+
continue
78+
79+
valid_channels += 1
80+
corr_declined = c_corr < p_corr
81+
uncorr_declined = c_uncorr < p_uncorr
82+
corr_ok = c_corr <= p_corr
83+
uncorr_ok = c_uncorr <= p_uncorr
84+
if (corr_declined or uncorr_declined) and corr_ok and uncorr_ok:
85+
declining_channels += 1
86+
87+
# 5. Guard: not enough valid channels
88+
if valid_channels < RESTART_MIN_OVERLAP:
89+
return
90+
91+
# 6. Primary signal: >=80% of valid overlapping channels are declining
92+
if declining_channels / valid_channels < RESTART_CHANNEL_THRESHOLD:
93+
return
94+
95+
# 7. Sanity check: at least one summary total must decline
96+
prev_summary = prev.get("summary", {})
97+
cur_summary = cur.get("summary", {})
98+
prev_corr_total = prev_summary.get("ds_correctable_errors", 0)
99+
prev_uncorr_total = prev_summary.get("ds_uncorrectable_errors", 0)
100+
cur_corr_total = cur_summary.get("ds_correctable_errors", 0)
101+
cur_uncorr_total = cur_summary.get("ds_uncorrectable_errors", 0)
102+
103+
if cur_corr_total >= prev_corr_total and cur_uncorr_total >= prev_uncorr_total:
104+
return # Neither total declining — not a restart
105+
106+
# 8. Emit event
107+
events.append({
108+
"timestamp": ts,
109+
"severity": "info",
110+
"event_type": "modem_restart_detected",
111+
"message": "Detected modem restart or counter reset pattern",
112+
"details": {
113+
"affected_channels": declining_channels,
114+
"total_channels": valid_channels,
115+
"prev_corr_total": prev_corr_total,
116+
"prev_uncorr_total": prev_uncorr_total,
117+
"current_corr_total": cur_corr_total,
118+
"current_uncorr_total": cur_uncorr_total,
119+
},
120+
})
121+
```
122+
123+
### Integration into check()
124+
125+
In `check()`, insert before `_check_errors` (between lines 69 and 70, after `_check_modulation`):
126+
127+
```python
128+
self._check_restart(events, ts, analysis, prev)
129+
```
130+
131+
Follows `_check_modulation`'s pattern: receives full analysis dicts, mutates `events` in-place, returns None.
132+
133+
### Constants
134+
135+
```python
136+
RESTART_CHANNEL_THRESHOLD = 0.8 # 80% of valid channels must be declining
137+
RESTART_MIN_OVERLAP = 4 # Minimum overlapping channels for fair comparison
138+
RESTART_MIN_CONTINUITY = 0.5 # Minimum overlap ratio (vs either snapshot)
139+
```
140+
141+
Defined at module level in `event_detector.py`, alongside the existing `UNCORR_SPIKE_THRESHOLD`.
142+
143+
## Edge Cases
144+
145+
| Scenario | Behavior |
146+
|---|---|
147+
| First poll after DOCSight start | `prev is None` → `check()` returns early, `_check_restart` never called |
148+
| Channel map change (new channel IDs) | Overlap drops below 50% → no restart verdict |
149+
| Single channel reset (partial) | Less than 80% declining → no restart verdict |
150+
| All counters were already 0 | Neither counter declined (0 < 0 is false) → not counted as declining. But channels with (N, 0) → (0, 0) ARE counted because correctable declined. |
151+
| Some channels have None counters | Skipped from valid_channels count entirely |
152+
| Counter wrap (32-bit overflow) | Would appear as decline, but extremely rare. Accepted as false positive. |
153+
| Modem restart + channel map change | If ≥50% overlap and ≥80% declining in overlap → still detected |
154+
| Driver returns empty channel list | `overlap_ids` empty → guard catches it |
155+
156+
## Event Format
157+
158+
```json
159+
{
160+
"timestamp": "2026-03-17T14:30:00Z",
161+
"severity": "info",
162+
"event_type": "modem_restart_detected",
163+
"message": "Detected modem restart or counter reset pattern",
164+
"details": {
165+
"affected_channels": 12,
166+
"total_channels": 14,
167+
"prev_corr_total": 45230,
168+
"prev_uncorr_total": 128,
169+
"current_corr_total": 0,
170+
"current_uncorr_total": 0
171+
}
172+
}
173+
```
174+
175+
Severity is `info` (not `warning`) because a restart is an observation, not necessarily a problem. The event appears in the Event Log and Correlation Timeline alongside other events.
176+
177+
## Testing Strategy
178+
179+
### Unit Tests
180+
181+
- `test_restart_detected` — all channels reset to 0
182+
- `test_restart_partial_channels` — 85% declining, 15% unchanged → detected
183+
- `test_no_restart_below_threshold` — 70% declining → not detected
184+
- `test_no_restart_first_poll` — prev is None → no event
185+
- `test_no_restart_counters_increasing` — normal operation, counters go up
186+
- `test_no_restart_insufficient_overlap` — channel map changed, <50% overlap
187+
- `test_no_restart_too_few_channels` — only 3 overlapping channels
188+
- `test_no_restart_totals_not_declining` — per-channel declining but totals still up (edge case)
189+
- `test_no_restart_none_counters` — channels with None values skipped
190+
- `test_restart_with_some_channel_change` — overlap ≥50%, channels declining → detected
191+
- `test_channels_already_zero` — 0→0 not counted as declining
192+
- `test_restart_does_not_trigger_error_spike` — restart pattern does not also emit error_spike (negative delta)
193+
- `test_channel_with_zero_uncorr_before_restart` — (N, 0) → (0, 0) correctly counted as declining
194+
195+
### Integration
196+
197+
- ModemCollector poll cycle with mock data showing restart pattern → event emitted and stored
198+
199+
## i18n
200+
201+
One new key per language (EN/DE/FR/ES), following the existing `event_type_<event_type>` pattern used by the Event Log UI:
202+
- `event_type_modem_restart_detected`: "Modem restart detected" / "Modem-Neustart erkannt" / "Redémarrage du modem détecté" / "Reinicio del módem detectado"
203+
204+
## Migration
205+
206+
No schema changes. Uses existing `events` table. No new config keys.

0 commit comments

Comments
 (0)