Skip to content

Commit 49152f8

Browse files
committed
feat(daemon): add monitoring layer with idle detection, stuck-state restart, and reconnection storm alerts
1 parent 59c7def commit 49152f8

File tree

9 files changed

+551
-15
lines changed

9 files changed

+551
-15
lines changed
Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
/**
 * Copyright (c) Hathor Labs and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */
8+
import MonitoringActor from '../../src/actors/MonitoringActor';
9+
import logger from '../../src/logger';
10+
import { EventTypes } from '../../src/types/event';
11+
import getConfig from '../../src/config';
12+
import { addAlert } from '@wallet-service/common';
13+
14+
jest.useFakeTimers();
15+
jest.spyOn(global, 'setInterval');
16+
jest.spyOn(global, 'clearInterval');
17+
18+
jest.mock('@wallet-service/common', () => ({
19+
...jest.requireActual('@wallet-service/common'),
20+
addAlert: jest.fn().mockResolvedValue(undefined),
21+
}));
22+
23+
const mockAddAlert = addAlert as jest.Mock;
24+
25+
describe('MonitoringActor', () => {
26+
let mockCallback: jest.Mock;
27+
let mockReceive: jest.Mock;
28+
let receiveCallback: (event: any) => void;
29+
let config: ReturnType<typeof getConfig>;
30+
31+
const sendEvent = (monitoringEventType: string) => {
32+
receiveCallback({
33+
type: EventTypes.MONITORING_EVENT,
34+
event: { type: monitoringEventType },
35+
});
36+
};
37+
38+
beforeEach(() => {
39+
jest.clearAllMocks();
40+
jest.clearAllTimers();
41+
config = getConfig();
42+
config['IDLE_EVENT_TIMEOUT_MS'] = 5 * 60 * 1000; // 5 min
43+
config['RECONNECTION_STORM_THRESHOLD'] = 3; // low threshold for tests
44+
config['RECONNECTION_STORM_WINDOW_MS'] = 5 * 60 * 1000; // 5 min
45+
46+
mockCallback = jest.fn();
47+
mockReceive = jest.fn().mockImplementation((cb: any) => {
48+
receiveCallback = cb;
49+
});
50+
});
51+
52+
afterAll(() => {
53+
jest.clearAllMocks();
54+
jest.useRealTimers();
55+
});
56+
57+
it('should not start the idle timer on initialization', () => {
58+
MonitoringActor(mockCallback, mockReceive, config);
59+
expect(setInterval).not.toHaveBeenCalled();
60+
});
61+
62+
it('should start the idle timer when receiving a CONNECTED event', () => {
63+
MonitoringActor(mockCallback, mockReceive, config);
64+
sendEvent('CONNECTED');
65+
expect(setInterval).toHaveBeenCalledTimes(1);
66+
});
67+
68+
it('should stop the idle timer when receiving a DISCONNECTED event', () => {
69+
MonitoringActor(mockCallback, mockReceive, config);
70+
sendEvent('CONNECTED');
71+
expect(setInterval).toHaveBeenCalledTimes(1);
72+
73+
sendEvent('DISCONNECTED');
74+
expect(clearInterval).toHaveBeenCalledTimes(1);
75+
});
76+
77+
it('should stop the idle timer when the actor is stopped', () => {
78+
const stopActor = MonitoringActor(mockCallback, mockReceive, config);
79+
sendEvent('CONNECTED');
80+
expect(setInterval).toHaveBeenCalledTimes(1);
81+
82+
stopActor();
83+
expect(clearInterval).toHaveBeenCalledTimes(1);
84+
});
85+
86+
it('should fire an idle alert after IDLE_EVENT_TIMEOUT_MS with no events', async () => {
87+
MonitoringActor(mockCallback, mockReceive, config);
88+
sendEvent('CONNECTED');
89+
90+
// Advance time past the idle timeout
91+
jest.advanceTimersByTime(config['IDLE_EVENT_TIMEOUT_MS'] + 1);
92+
93+
// Allow the async addAlert promise to resolve
94+
await Promise.resolve();
95+
96+
expect(mockAddAlert).toHaveBeenCalledTimes(1);
97+
expect(mockAddAlert.mock.calls[0][0]).toBe('Daemon Idle — No Events Received');
98+
});
99+
100+
it('should NOT fire an idle alert when events are being received', async () => {
101+
MonitoringActor(mockCallback, mockReceive, config);
102+
sendEvent('CONNECTED');
103+
104+
// Advance to just before the timeout
105+
jest.advanceTimersByTime(config['IDLE_EVENT_TIMEOUT_MS'] - 1000);
106+
sendEvent('EVENT_RECEIVED');
107+
108+
// Advance past the original threshold (but lastEventReceivedAt was reset)
109+
jest.advanceTimersByTime(config['IDLE_EVENT_TIMEOUT_MS'] - 1000);
110+
111+
await Promise.resolve();
112+
expect(mockAddAlert).not.toHaveBeenCalled();
113+
});
114+
115+
it('should fire only one idle alert even if the timer fires multiple times', async () => {
116+
MonitoringActor(mockCallback, mockReceive, config);
117+
sendEvent('CONNECTED');
118+
119+
// Fire timer three times without receiving any events
120+
jest.advanceTimersByTime(config['IDLE_EVENT_TIMEOUT_MS'] * 3);
121+
122+
await Promise.resolve();
123+
124+
expect(mockAddAlert).toHaveBeenCalledTimes(1);
125+
});
126+
127+
it('should reset the idle alert flag when an event is received after an alert fired', async () => {
128+
MonitoringActor(mockCallback, mockReceive, config);
129+
sendEvent('CONNECTED');
130+
131+
// Trigger first alert (interval fires at T = IDLE_EVENT_TIMEOUT_MS)
132+
jest.advanceTimersByTime(config['IDLE_EVENT_TIMEOUT_MS'] + 1);
133+
await Promise.resolve();
134+
expect(mockAddAlert).toHaveBeenCalledTimes(1);
135+
136+
// Receive an event — resets idleAlertFired and lastEventReceivedAt to current time T1
137+
sendEvent('EVENT_RECEIVED');
138+
139+
// The next interval tick where idleMs >= threshold is at 3*T (the interval at 2*T
140+
// fires only T-1 ms after EVENT_RECEIVED, which is below the threshold).
141+
// Advancing by 2*T from T1 guarantees we cross that boundary.
142+
jest.advanceTimersByTime(2 * config['IDLE_EVENT_TIMEOUT_MS']);
143+
await Promise.resolve();
144+
145+
// A second alert should now be fired
146+
expect(mockAddAlert).toHaveBeenCalledTimes(2);
147+
});
148+
149+
it('should fire a reconnection storm alert when threshold is reached', async () => {
150+
MonitoringActor(mockCallback, mockReceive, config);
151+
152+
// Send enough reconnections to trigger the storm threshold (3 in our test config)
153+
sendEvent('RECONNECTING');
154+
sendEvent('RECONNECTING');
155+
sendEvent('RECONNECTING');
156+
157+
await Promise.resolve();
158+
159+
expect(mockAddAlert).toHaveBeenCalledTimes(1);
160+
expect(mockAddAlert.mock.calls[0][0]).toBe('Daemon Reconnection Storm');
161+
});
162+
163+
it('should NOT fire a reconnection storm alert below the threshold', async () => {
164+
MonitoringActor(mockCallback, mockReceive, config);
165+
166+
// Send fewer reconnections than the threshold
167+
sendEvent('RECONNECTING');
168+
sendEvent('RECONNECTING');
169+
170+
await Promise.resolve();
171+
expect(mockAddAlert).not.toHaveBeenCalled();
172+
});
173+
174+
it('should evict old reconnections outside the storm window', async () => {
175+
MonitoringActor(mockCallback, mockReceive, config);
176+
177+
// Two reconnections at time 0
178+
sendEvent('RECONNECTING');
179+
sendEvent('RECONNECTING');
180+
181+
// Advance past the storm window so those timestamps are evicted
182+
jest.advanceTimersByTime(config['RECONNECTION_STORM_WINDOW_MS'] + 1000);
183+
184+
// One new reconnection — count should restart from 1, no alert
185+
sendEvent('RECONNECTING');
186+
187+
await Promise.resolve();
188+
expect(mockAddAlert).not.toHaveBeenCalled();
189+
});
190+
191+
it('should restart idle timer when CONNECTED is sent while already connected', () => {
192+
MonitoringActor(mockCallback, mockReceive, config);
193+
194+
sendEvent('CONNECTED');
195+
expect(setInterval).toHaveBeenCalledTimes(1);
196+
197+
// A second CONNECTED clears the old timer and creates a new one
198+
sendEvent('CONNECTED');
199+
expect(clearInterval).toHaveBeenCalledTimes(1);
200+
expect(setInterval).toHaveBeenCalledTimes(2);
201+
});
202+
203+
it('should ignore events of other types', () => {
204+
const warnSpy = jest.spyOn(logger, 'warn');
205+
MonitoringActor(mockCallback, mockReceive, config);
206+
207+
receiveCallback({ type: 'SOME_OTHER_EVENT', event: { type: 'WHATEVER' } });
208+
209+
expect(warnSpy).toHaveBeenCalledWith(
210+
'[monitoring] Unexpected event type received by MonitoringActor',
211+
);
212+
expect(setInterval).not.toHaveBeenCalled();
213+
});
214+
});

packages/daemon/src/actions/index.ts

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import logger from '../logger';
1212
import { hashTxData } from '../utils';
1313
import { createStartStreamMessage, createSendAckMessage } from '../actors';
1414
import { bigIntUtils } from '@hathor/wallet-lib';
15+
import { addAlert, Severity } from '@wallet-service/common';
1516

1617
/*
1718
* This action is used to store the initial event id on the context
@@ -197,3 +198,68 @@ export const stopHealthcheckPing = sendTo(
197198
* Logs the event as an error log
198199
*/
199200
export const logEventError = (_context: Context, event: Event) => logger.error(bigIntUtils.JSONBigInt.stringify(event));
201+
202+
/*
203+
* This is a helper to get the monitoring ref from the context and throw if it's not found.
204+
*/
205+
export const getMonitoringRefFromContext = (context: Context) => {
206+
if (!context.monitoring) {
207+
throw new Error('No monitoring actor in context');
208+
}
209+
210+
return context.monitoring;
211+
};
212+
213+
/*
214+
* Notifies the monitoring actor that the WebSocket became connected.
215+
*/
216+
export const sendMonitoringConnected = sendTo(
217+
getMonitoringRefFromContext,
218+
{ type: EventTypes.MONITORING_EVENT, event: { type: 'CONNECTED' } },
219+
);
220+
221+
/*
222+
* Notifies the monitoring actor that the WebSocket disconnected.
223+
*/
224+
export const sendMonitoringDisconnected = sendTo(
225+
getMonitoringRefFromContext,
226+
{ type: EventTypes.MONITORING_EVENT, event: { type: 'DISCONNECTED' } },
227+
);
228+
229+
/*
230+
* Notifies the monitoring actor that a fullnode event was received (resets the idle timer).
231+
*/
232+
export const sendMonitoringEventReceived = sendTo(
233+
getMonitoringRefFromContext,
234+
{ type: EventTypes.MONITORING_EVENT, event: { type: 'EVENT_RECEIVED' } },
235+
);
236+
237+
/*
238+
* Notifies the monitoring actor that the machine is entering the RECONNECTING state.
239+
*/
240+
export const sendMonitoringReconnecting = sendTo(
241+
getMonitoringRefFromContext,
242+
{ type: EventTypes.MONITORING_EVENT, event: { type: 'RECONNECTING' } },
243+
);
244+
245+
/*
246+
* Fires a CRITICAL alert and logs when the machine has been stuck in a processing
247+
* state for longer than STUCK_PROCESSING_TIMEOUT_MS. The machine will transition to
248+
* RECONNECTING immediately after this action runs.
249+
*/
250+
export const alertStuckProcessing = (context: Context) => {
251+
const eventId = context.event?.event?.id;
252+
logger.error(
253+
`[monitoring] State machine stuck processing event ${eventId ?? 'unknown'} for too long — forcing reconnection`,
254+
);
255+
addAlert(
256+
'Daemon Stuck In Processing State',
257+
`The state machine has been processing event ${eventId ?? 'unknown'} ` +
258+
'for longer than the configured timeout. Forcing a reconnection.',
259+
Severity.CRITICAL,
260+
{ eventId: eventId !== undefined ? String(eventId) : 'unknown' },
261+
logger,
262+
).catch((err: Error) =>
263+
logger.error(`[monitoring] Failed to send stuck-processing alert: ${err}`),
264+
);
265+
};

0 commit comments

Comments
 (0)