Skip to content

Commit 289eed9

Browse files
committed
refactor(daemon): stuck-processing only alerts; default 1 hour
1 parent 78ba9c1 commit 289eed9

File tree

5 files changed

+17
-31
lines changed

5 files changed

+17
-31
lines changed

packages/daemon/__tests__/actors/MonitoringActor.test.ts

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -173,18 +173,17 @@ describe('MonitoringActor', () => {
173173
expect(clearTimeout).toHaveBeenCalledTimes(1);
174174
});
175175

176-
it('should fire a CRITICAL alert and call back MONITORING_STUCK_PROCESSING when stuck', async () => {
176+
it('should fire a MAJOR alert when stuck', async () => {
177177
MonitoringActor(mockCallback, mockReceive, config);
178178
sendEvent('PROCESSING_STARTED');
179179

180180
jest.advanceTimersByTime(config['STUCK_PROCESSING_TIMEOUT_MS'] + 1);
181-
// Let the async addAlert inside the timeout resolve
182181
await Promise.resolve();
183182
await Promise.resolve();
184183

185184
expect(mockAddAlert).toHaveBeenCalledTimes(1);
186185
expect(mockAddAlert.mock.calls[0][0]).toBe('Daemon Stuck In Processing State');
187-
expect(mockCallback).toHaveBeenCalledWith({ type: EventTypes.MONITORING_STUCK_PROCESSING });
186+
expect(mockCallback).not.toHaveBeenCalled();
188187
});
189188

190189
it('should NOT fire the stuck alert when PROCESSING_COMPLETED arrives in time', async () => {

packages/daemon/src/actors/MonitoringActor.ts

Lines changed: 12 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ import { Event, EventTypes } from '../types';
3535
* RECONNECTION_STORM_WINDOW_MS.
3636
*/
3737
const DEFAULT_IDLE_EVENT_TIMEOUT_MS = 5 * 60 * 1000;
38-
const DEFAULT_STUCK_PROCESSING_TIMEOUT_MS = 5 * 60 * 1000;
38+
const DEFAULT_STUCK_PROCESSING_TIMEOUT_MS = 60 * 60 * 1000; // 1 hour
3939
const DEFAULT_RECONNECTION_STORM_THRESHOLD = 10;
4040
const DEFAULT_RECONNECTION_STORM_WINDOW_MS = 5 * 60 * 1000;
4141

@@ -93,21 +93,17 @@ export default (callback: any, receive: any, config = getConfig()) => {
9393
const startStuckTimer = () => {
9494
clearStuckTimer();
9595
stuckTimer = setTimeout(async () => {
96-
logger.error('[monitoring] State machine stuck in processing state — forcing reconnection');
97-
try {
98-
await addAlert(
99-
'Daemon Stuck In Processing State',
100-
`The state machine has been processing a single event for more than ` +
101-
`${Math.round(stuckTimeoutMs / 60000)} minute(s). ` +
102-
'Forcing a reconnection.',
103-
Severity.MAJOR,
104-
{ timeoutMs: String(stuckTimeoutMs) },
105-
logger,
106-
);
107-
} catch (err) {
108-
logger.error(`[monitoring] Failed to send stuck-processing alert: ${err}`);
109-
}
110-
callback({ type: EventTypes.MONITORING_STUCK_PROCESSING });
96+
logger.error('[monitoring] State machine stuck in processing state');
97+
addAlert(
98+
'Daemon Stuck In Processing State',
99+
`The state machine has been processing a single event for more than ` +
100+
`${Math.round(stuckTimeoutMs / 60000)} minute(s).`,
101+
Severity.MAJOR,
102+
{ timeoutMs: String(stuckTimeoutMs) },
103+
logger,
104+
).catch((err: Error) =>
105+
logger.error(`[monitoring] Failed to send stuck-processing alert: ${err}`),
106+
);
111107
}, stuckTimeoutMs);
112108
};
113109

packages/daemon/src/config.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,8 @@ export const ACK_TIMEOUT_MS = parseInt(process.env.ACK_TIMEOUT_MS ?? '20000', 10
9090
// Monitoring configuration
9191
// Timeout (ms) before alerting when no fullnode events are received while the WebSocket is connected
9292
export const IDLE_EVENT_TIMEOUT_MS = parseInt(process.env.IDLE_EVENT_TIMEOUT_MS ?? String(5 * 60 * 1000), 10); // 5 minutes
93-
// Timeout (ms) before auto-restarting when stuck in a single processing state
94-
export const STUCK_PROCESSING_TIMEOUT_MS = parseInt(process.env.STUCK_PROCESSING_TIMEOUT_MS ?? String(5 * 60 * 1000), 10); // 5 minutes
93+
// Timeout (ms) before alerting when stuck in a single processing state
94+
export const STUCK_PROCESSING_TIMEOUT_MS = parseInt(process.env.STUCK_PROCESSING_TIMEOUT_MS ?? String(60 * 60 * 1000), 10); // 1 hour
9595
// Number of reconnections within RECONNECTION_STORM_WINDOW_MS to trigger a storm alert
9696
export const RECONNECTION_STORM_THRESHOLD = parseInt(process.env.RECONNECTION_STORM_THRESHOLD ?? '10', 10);
9797
// Time window (ms) for reconnection storm detection

packages/daemon/src/machines/SyncMachine.ts

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ import { WebSocketActor, HealthCheckActor, MonitoringActor } from '../actors';
1515
import {
1616
Context,
1717
Event,
18-
EventTypes,
1918
} from '../types';
2019
import {
2120
handleVertexAccepted,
@@ -392,10 +391,6 @@ export const SyncMachine = Machine<Context, any, Event>({
392391
cond: 'websocketDisconnected',
393392
target: SYNC_MACHINE_STATES.RECONNECTING,
394393
}],
395-
// Sent by MonitoringActor when a processing state has been active for too long
396-
[EventTypes.MONITORING_STUCK_PROCESSING]: {
397-
target: `#SyncMachine.${SYNC_MACHINE_STATES.RECONNECTING}`,
398-
},
399394
},
400395
},
401396
[SYNC_MACHINE_STATES.ERROR]: {

packages/daemon/src/types/event.ts

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@ export enum EventTypes {
4242
WEBSOCKET_SEND_EVENT = 'WEBSOCKET_SEND_EVENT',
4343
HEALTHCHECK_EVENT = 'HEALTHCHECK_EVENT',
4444
MONITORING_EVENT = 'MONITORING_EVENT',
45-
MONITORING_STUCK_PROCESSING = 'MONITORING_STUCK_PROCESSING',
4645
}
4746

4847
export enum FullNodeEventTypes {
@@ -83,10 +82,7 @@ export type Event =
8382
| { type: EventTypes.FULLNODE_EVENT, event: FullNodeEvent }
8483
| { type: EventTypes.WEBSOCKET_SEND_EVENT, event: WebSocketSendEvent }
8584
| { type: EventTypes.HEALTHCHECK_EVENT, event: HealthCheckEvent }
86-
| { type: EventTypes.MONITORING_EVENT, event: MonitoringEvent }
87-
// Machine-internal signal sent by MonitoringActor when a processing state is stuck.
88-
// `event: never` keeps the union consistent (all members have an `event` field).
89-
| { type: EventTypes.MONITORING_STUCK_PROCESSING, event: never };
85+
| { type: EventTypes.MONITORING_EVENT, event: MonitoringEvent };
9086

9187

9288
export interface VertexRemovedEventData {

0 commit comments

Comments
 (0)