Skip to content

Commit ee35f20

Browse files
authored
chore: Implement Watchdog process for reliable telemetry (#777)
This PR implements the watchdog process architecture for the telemetry system. It moves the `ClearcutSender` execution to a dedicated child process, ensuring that events—especially shutdown events—are reliably transmitted even if the main server process terminates abruptly. Added an e2e test that runs the server, checks the log file and confirms the telemetry logs exist and that the watchdog process is correctly killed after sending the shutdown event once the main process is killed. **Implementation Roadmap:** This is the fourth in a series of PRs designed to implement the telemetry system: 1. **CLI & Opt-out Mechanism ([Merged](#757)):** * Added `--usage-statistics` flag and transparency logging. 2. **Logger Scaffolding & Integration ([Merged](#758)):** * **`ClearcutLogger`**: Implemented the main logging entry point. * **One-way Data Flow**: Integrated `logToolInvocation` and `logServerStart` hooks into `main.ts` to capture events. * **`ClearcutSender`**: Introduced a transport abstraction. * **Type Definitions**: Added TypeScript definitions for the telemetry Protocol Buffer messages. 3. **Persistence Layer ([Merged](#766)):** * **`FilePersistence`**: Implemented a local file-based state manager to persist the `lastActive` timestamp. * **Daily Active Logic**: Integrated persistence into `ClearcutLogger` to automatically detect and log `daily_active` events (with `days_since_last_active` calculation) via `logDailyActiveIfNeeded`. 4. **Watchdog Process Architecture (This PR):** * **`WatchdogClient`**: Added a client-side wrapper to spawn and communicate with the watchdog process via `stdin`. * **`watchdog/main.ts`**: Created the entry point for the watchdog process. It listens for IPC messages and uses `ClearcutSender` to transmit events. * **Reliable Shutdown**: The watchdog monitors the parent process and guarantees a `shutdown` event is sent when the parent exits or crashes (detecting `stdin` closure). 
* **Refactoring**: Moved `ClearcutSender` to the `watchdog` directory and updated `ClearcutLogger` to delegate event sending to the `WatchdogClient`. 5. **Transport, Batching & Retries (Next):** * Finalize `ClearcutSender` with actual HTTP transport logic, including event batching and exponential backoff retries.
1 parent a83a338 commit ee35f20

File tree

12 files changed

+762
-80
lines changed

12 files changed

+762
-80
lines changed

src/logger.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,17 @@ export function saveLogsToFile(fileName: string): fs.WriteStream {
3131
return logFile;
3232
}
3333

34+
/**
 * Ends the given log stream and waits for its `end()` callback.
 *
 * @param logFile - Write stream to close.
 * @param timeoutMs - Maximum time, in milliseconds, to wait for the stream's
 *   `end()` callback before giving up.
 * @returns A promise that resolves once the `end()` callback has run.
 * @throws Rejects with an `Error` if the callback has not run within
 *   `timeoutMs`.
 */
export function flushLogs(
  logFile: fs.WriteStream,
  timeoutMs = 2000,
): Promise<void> {
  return new Promise((resolve, reject) => {
    // Reject with a real Error so callers never see an `undefined` reason.
    const timeout = setTimeout(() => {
      reject(new Error(`Timed out after ${timeoutMs}ms while flushing logs`));
    }, timeoutMs);
    logFile.end(() => {
      clearTimeout(timeout);
      resolve();
    });
  });
}
46+
3447
export const logger = debug(mcpDebugNamespace);

src/main.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,10 @@ export const args = parseArguments(VERSION);
3838
const logFile = args.logFile ? saveLogsToFile(args.logFile) : undefined;
3939
let clearcutLogger: ClearcutLogger | undefined;
4040
if (args.usageStatistics) {
41-
clearcutLogger = new ClearcutLogger();
41+
clearcutLogger = new ClearcutLogger({
42+
logFile: args.logFile,
43+
appVersion: VERSION,
44+
});
4245
}
4346

4447
process.on('unhandledRejection', (reason, promise) => {

src/telemetry/clearcut-logger.ts

Lines changed: 53 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -4,42 +4,75 @@
44
* SPDX-License-Identifier: Apache-2.0
55
*/
66

7+
import process from 'node:process';
8+
79
import {logger} from '../logger.js';
810

9-
import {ClearcutSender} from './clearcut-sender.js';
1011
import type {LocalState, Persistence} from './persistence.js';
1112
import {FilePersistence} from './persistence.js';
12-
import type {FlagUsage} from './types.js';
13+
import {type FlagUsage, WatchdogMessageType, OsType} from './types.js';
14+
import {WatchdogClient} from './watchdog-client.js';
1315

1416
const MS_PER_DAY = 24 * 60 * 60 * 1000;
1517

18+
/**
 * Translates `process.platform` into the telemetry `OsType` enum.
 *
 * @returns The matching `OsType` for `win32`, `darwin` and `linux`;
 *   `OsType.OS_TYPE_UNSPECIFIED` for any other platform string.
 */
function detectOsType(): OsType {
  const platform = process.platform;
  if (platform === 'win32') {
    return OsType.OS_TYPE_WINDOWS;
  }
  if (platform === 'darwin') {
    return OsType.OS_TYPE_MACOS;
  }
  if (platform === 'linux') {
    return OsType.OS_TYPE_LINUX;
  }
  return OsType.OS_TYPE_UNSPECIFIED;
}
30+
1631
export class ClearcutLogger {
1732
#persistence: Persistence;
18-
#sender: ClearcutSender;
33+
#watchdog: WatchdogClient;
1934

20-
constructor(options?: {persistence?: Persistence; sender?: ClearcutSender}) {
21-
this.#persistence = options?.persistence ?? new FilePersistence();
22-
this.#sender = options?.sender ?? new ClearcutSender();
35+
/**
 * @param options.appVersion - Version string forwarded to the watchdog.
 * @param options.logFile - Optional log file path forwarded to the watchdog.
 * @param options.persistence - State store; a `FilePersistence` is created
 *   when omitted.
 * @param options.watchdogClient - Pre-built client; when omitted a
 *   `WatchdogClient` is created for the current process.
 */
constructor(options: {
  appVersion: string;
  logFile?: string;
  persistence?: Persistence;
  watchdogClient?: WatchdogClient;
}) {
  const {appVersion, logFile, persistence, watchdogClient} = options;

  this.#persistence = persistence ?? new FilePersistence();

  if (watchdogClient != null) {
    this.#watchdog = watchdogClient;
  } else {
    this.#watchdog = new WatchdogClient({
      parentPid: process.pid,
      appVersion,
      osType: detectOsType(),
      logFile,
    });
  }
}
2451

2552
async logToolInvocation(args: {
2653
toolName: string;
2754
success: boolean;
2855
latencyMs: number;
2956
}): Promise<void> {
30-
await this.#sender.send({
31-
tool_invocation: {
32-
tool_name: args.toolName,
33-
success: args.success,
34-
latency_ms: args.latencyMs,
57+
this.#watchdog.send({
58+
type: WatchdogMessageType.LOG_EVENT,
59+
payload: {
60+
tool_invocation: {
61+
tool_name: args.toolName,
62+
success: args.success,
63+
latency_ms: args.latencyMs,
64+
},
3565
},
3666
});
3767
}
3868

3969
async logServerStart(flagUsage: FlagUsage): Promise<void> {
40-
await this.#sender.send({
41-
server_start: {
42-
flag_usage: flagUsage,
70+
this.#watchdog.send({
71+
type: WatchdogMessageType.LOG_EVENT,
72+
payload: {
73+
server_start: {
74+
flag_usage: flagUsage,
75+
},
4376
},
4477
});
4578
}
@@ -57,13 +90,15 @@ export class ClearcutLogger {
5790
daysSince = Math.ceil(diffTime / MS_PER_DAY);
5891
}
5992

60-
await this.#sender.send({
61-
daily_active: {
62-
days_since_last_active: daysSince,
93+
this.#watchdog.send({
94+
type: WatchdogMessageType.LOG_EVENT,
95+
payload: {
96+
daily_active: {
97+
days_since_last_active: daysSince,
98+
},
6399
},
64100
});
65101

66-
// Update persistence
67102
state.lastActive = new Date().toISOString();
68103
await this.#persistence.saveState(state);
69104
}

src/telemetry/clearcut-sender.ts

Lines changed: 0 additions & 15 deletions
This file was deleted.

src/telemetry/types.ts

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,11 @@ export interface ChromeDevToolsMcpExtension {
1313
tool_invocation?: ToolInvocation;
1414
server_start?: ServerStart;
1515
daily_active?: DailyActive;
16+
server_shutdown?: ServerShutdown;
1617
}
1718

19+
export type ServerShutdown = Record<string, never>;
20+
1821
export interface ToolInvocation {
1922
tool_name: string;
2023
success: boolean;
@@ -65,3 +68,14 @@ export enum McpClient {
6568
MCP_CLIENT_CLAUDE_CODE = 1,
6669
MCP_CLIENT_GEMINI_CLI = 2,
6770
}
71+
72+
// IPC types for messages between the main process and the
73+
// telemetry watchdog process.
74+
/**
 * Discriminator values for the `type` field of a `WatchdogMessage`.
 */
export enum WatchdogMessageType {
75+
/** Message whose payload is a telemetry event. */
LOG_EVENT = 'log-event',
76+
}
77+
78+
/**
 * Envelope for a single message exchanged with the telemetry watchdog
 * process.
 */
export interface WatchdogMessage {
79+
/** Message kind; `LOG_EVENT` is the only kind currently declared. */
type: WatchdogMessageType.LOG_EVENT;
80+
/** The telemetry event carried by this message. */
payload: ChromeDevToolsMcpExtension;
81+
}

src/telemetry/watchdog-client.ts

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
/**
2+
* @license
3+
* Copyright 2026 Google LLC
4+
* SPDX-License-Identifier: Apache-2.0
5+
*/
6+
7+
import {spawn, type ChildProcess} from 'node:child_process';
8+
import {fileURLToPath} from 'node:url';
9+
10+
import {logger} from '../logger.js';
11+
12+
import type {WatchdogMessage, OsType} from './types.js';
13+
14+
/**
 * Client-side handle for the telemetry watchdog child process.
 *
 * Spawns `./watchdog/main.js` (resolved relative to this module) with the
 * current Node executable as a detached child and sends it messages as
 * newline-delimited JSON over the child's stdin.
 */
export class WatchdogClient {
  #childProcess: ChildProcess;

  /**
   * @param config - Values passed to the watchdog as command-line flags
   *   (`--parent-pid`, `--app-version`, `--os-type` and, when set,
   *   `--log-file`).
   * @param options - Optional overrides; `spawn` replaces
   *   `child_process.spawn`.
   */
  constructor(
    config: {
      parentPid: number;
      appVersion: string;
      osType: OsType;
      logFile?: string;
    },
    options?: {spawn?: typeof spawn},
  ) {
    const watchdogPath = fileURLToPath(
      new URL('./watchdog/main.js', import.meta.url),
    );

    const args = [
      watchdogPath,
      `--parent-pid=${config.parentPid}`,
      `--app-version=${config.appVersion}`,
      `--os-type=${config.osType}`,
    ];

    if (config.logFile) {
      args.push(`--log-file=${config.logFile}`);
    }

    const spawner = options?.spawn ?? spawn;
    this.#childProcess = spawner(process.execPath, args, {
      stdio: ['pipe', 'ignore', 'ignore'],
      detached: true,
    });
    this.#childProcess.unref();
    this.#childProcess.on('error', err => {
      logger('Watchdog process error:', err);
    });
    this.#childProcess.on('exit', (code, signal) => {
      logger(`Watchdog exited with code ${code} and signal ${signal}`);
    });
    // Pipe failures (e.g. EPIPE after the watchdog died) are reported
    // asynchronously through the stream's 'error' event, which the try/catch
    // in send() cannot intercept. Without a listener the event would surface
    // as an uncaught exception in the main process.
    this.#childProcess.stdin?.on('error', err => {
      logger('Watchdog stdin error:', err);
    });
  }

  /**
   * Serializes `message` as one JSON line and writes it to the watchdog's
   * stdin. The message is dropped (and logged) when stdin is missing or
   * destroyed, or when the child has no pid.
   *
   * @param message - Message to forward to the watchdog process.
   */
  send(message: WatchdogMessage): void {
    const stdin = this.#childProcess.stdin;
    if (!stdin || stdin.destroyed || !this.#childProcess.pid) {
      logger('Watchdog stdin not available, dropping message');
      return;
    }

    try {
      const line = JSON.stringify(message) + '\n';
      stdin.write(line);
    } catch (err) {
      // Covers synchronous failures only (e.g. JSON.stringify throwing).
      logger('Failed to write to watchdog stdin', err);
    }
  }
}
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
/**
2+
* @license
3+
* Copyright 2026 Google LLC
4+
* SPDX-License-Identifier: Apache-2.0
5+
*/
6+
7+
import crypto from 'node:crypto';
8+
9+
import {logger} from '../../logger.js';
10+
import type {ChromeDevToolsMcpExtension, OsType} from '../types.js';
11+
12+
const SESSION_ROTATION_INTERVAL_MS = 24 * 60 * 60 * 1000;
13+
14+
/**
 * Adds session and environment metadata to telemetry events and passes them
 * to `transport`.
 *
 * A random session id is generated on construction and replaced once the
 * current one is older than `SESSION_ROTATION_INTERVAL_MS`.
 */
export class ClearcutSender {
  #appVersion: string;
  #osType: OsType;
  #sessionId: string = crypto.randomUUID();
  #sessionCreated: number = Date.now();

  /**
   * @param appVersion - Value written to `app_version` on every event.
   * @param osType - Value written to `os_type` on every event.
   */
  constructor(appVersion: string, osType: OsType) {
    this.#appVersion = appVersion;
    this.#osType = osType;
  }

  /**
   * Rotates the session if it is stale, then transports a copy of `event`
   * with `session_id`, `app_version` and `os_type` filled in.
   */
  async send(event: ChromeDevToolsMcpExtension): Promise<void> {
    this.#refreshSession();
    this.transport({
      ...event,
      session_id: this.#sessionId,
      app_version: this.#appVersion,
      os_type: this.#osType,
    });
  }

  /** Delivers an event; currently this only logs it as pretty-printed JSON. */
  transport(event: ChromeDevToolsMcpExtension): void {
    const serialized = JSON.stringify(event, null, 2);
    logger('Telemetry event', serialized);
  }

  /** Sends an event whose only payload is an empty `server_shutdown`. */
  async sendShutdownEvent(): Promise<void> {
    await this.send({server_shutdown: {}});
  }

  /** Starts a new session when the current one has outlived the interval. */
  #refreshSession(): void {
    const sessionAgeMs = Date.now() - this.#sessionCreated;
    if (sessionAgeMs > SESSION_ROTATION_INTERVAL_MS) {
      this.#sessionId = crypto.randomUUID();
      this.#sessionCreated = Date.now();
    }
  }
}

0 commit comments

Comments
 (0)