Skip to content

Commit 8268f32

Browse files
authored
[local ssh] improve observability and try to prevent hanging (#90)
- add started/existed status reporting to identify hanging process - don't swallow errors, but report all, we can sort them out later to investigate root causes of hanging - add detection of stale web sockets to prevent hanging
1 parent 5088d1c commit 8268f32

File tree

3 files changed

+140
-79
lines changed

3 files changed

+140
-79
lines changed

src/common/telemetry.ts

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ export function getErrorMetricsEndpoint(gitpodHost: string): string {
6868
return `https://ide.${serviceUrl.hostname}/metrics-api/reportError`;
6969
}
7070

71-
export function commonSendEventData(logService: ILogService, segmentClient: Analytics, machineId: string, eventName: string, data?: any) {
71+
export async function commonSendEventData(logService: ILogService, segmentClient: Analytics, machineId: string, eventName: string, data?: any): Promise<void> {
7272
const properties = data ?? {};
7373

7474
delete properties['gitpodHost'];
@@ -77,16 +77,17 @@ export function commonSendEventData(logService: ILogService, segmentClient: Anal
7777
logService.trace('Local event report', eventName, properties);
7878
return;
7979
}
80-
81-
segmentClient.track({
82-
anonymousId: machineId,
83-
event: eventName,
84-
properties
85-
}, (err) => {
86-
if (err) {
87-
logService.error('Failed to log event to app analytics:', err);
88-
}
89-
});
80+
return new Promise((resolve) =>
81+
segmentClient.track({
82+
anonymousId: machineId,
83+
event: eventName,
84+
properties
85+
}, (err) => {
86+
if (err) {
87+
logService.error('Failed to log event to app analytics:', err);
88+
}
89+
resolve();
90+
}))
9091
}
9192

9293
interface SendErrorDataOptions {

src/local-ssh/proxy.ts

Lines changed: 122 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,73 @@
33
* Licensed under the MIT License. See License.txt in the project root for license information.
44
*--------------------------------------------------------------------------------------------*/
55

6+
interface ClientOptions {
7+
host: string;
8+
extIpcPort: number;
9+
machineID: string;
10+
}
11+
12+
function getClientOptions(): ClientOptions {
13+
const args = process.argv.slice(2);
14+
// %h is in the form of <ws_id>.vss.<gitpod_host>'
15+
// add `https://` prefix since our gitpodHost is actually a url not host
16+
const host = 'https://' + args[0].split('.').splice(2).join('.');
17+
return {
18+
host,
19+
extIpcPort: Number.parseInt(args[1], 10),
20+
machineID: args[2] ?? '',
21+
};
22+
}
23+
24+
const options = getClientOptions();
25+
if (!options) {
26+
process.exit(1);
27+
}
28+
29+
import { NopeLogger } from './logger';
30+
const logService = new NopeLogger();
31+
32+
// DO NOT PUSH CHANGES BELOW TO PRODUCTION
33+
// import { DebugLogger } from './logger';
34+
// const logService = new DebugLogger();
35+
36+
import { TelemetryService } from './telemetryService';
37+
const telemetryService = new TelemetryService(
38+
process.env.SEGMENT_KEY!,
39+
options.machineID,
40+
process.env.EXT_NAME!,
41+
process.env.EXT_VERSION!,
42+
options.host,
43+
logService
44+
);
45+
46+
const flow: SSHUserFlowTelemetry = {
47+
flow: 'local_ssh',
48+
gitpodHost: options.host,
49+
workspaceId: '',
50+
processId: process.pid,
51+
};
52+
53+
telemetryService.sendUserFlowStatus('started', flow);
54+
const sendExited = (exitCode: number, forceExit: boolean, exitSignal?: NodeJS.Signals) => {
55+
return telemetryService.sendUserFlowStatus('exited', {
56+
...flow,
57+
exitCode,
58+
forceExit: String(forceExit),
59+
signal: exitSignal
60+
});
61+
};
62+
// best effort to intercept process exit
63+
const beforeExitListener = (exitCode: number) => {
64+
process.removeListener('beforeExit', beforeExitListener);
65+
return sendExited(exitCode, false)
66+
};
67+
process.addListener('beforeExit', beforeExitListener);
68+
const exitProcess = async (forceExit: boolean, signal?: NodeJS.Signals) => {
69+
await sendExited(0, forceExit, signal);
70+
process.exit(0);
71+
};
72+
673
import { SshClient } from '@microsoft/dev-tunnels-ssh-tcp';
774
import { NodeStream, SshClientCredentials, SshClientSession, SshDisconnectReason, SshServerSession, SshSessionConfiguration, Stream, WebSocketStream } from '@microsoft/dev-tunnels-ssh';
875
import { importKey, importKeyBytes } from '@microsoft/dev-tunnels-ssh-keys';
@@ -13,7 +80,6 @@ import { WrapError } from '../common/utils';
1380
import { WebSocket } from 'ws';
1481
import * as stream from 'stream';
1582
import { ILogService } from '../services/logService';
16-
import { TelemetryService } from './telemetryService';
1783
import { ITelemetryService, UserFlowTelemetryProperties } from '../common/telemetry';
1884
import { LocalSSHMetricsReporter } from '../services/localSSHMetrics';
1985

@@ -25,24 +91,6 @@ function getHostKey(): Buffer {
2591
return Buffer.from(HOST_KEY, 'base64');
2692
}
2793

28-
interface ClientOptions {
29-
host: string;
30-
extIpcPort: number;
31-
machineID: string;
32-
}
33-
34-
function getClientOptions(): ClientOptions {
35-
const args = process.argv.slice(2);
36-
// %h is in the form of <ws_id>.vss.<gitpod_host>'
37-
// add `https://` prefix since our gitpodHost is actually a url not host
38-
const host = 'https://' + args[0].split('.').splice(2).join('.');
39-
return {
40-
host,
41-
extIpcPort: Number.parseInt(args[1], 10),
42-
machineID: args[2] ?? '',
43-
};
44-
}
45-
4694
type FailedToProxyCode = 'SSH.AuthenticationFailed' | 'TUNNEL.AuthenticateSSHKeyFailed' | 'NoRunningInstance' | 'FailedToGetAuthInfo' | 'GitpodHostMismatch' | 'NoAccessTokenFound';
4795

4896
// IgnoredFailedCodes contains the failreCode that don't need to send error report
@@ -74,28 +122,22 @@ interface SSHUserFlowTelemetry extends UserFlowTelemetryProperties {
74122

75123
class WebSocketSSHProxy {
76124
private extensionIpc: Client<ExtensionServiceDefinition>;
77-
private flow: SSHUserFlowTelemetry;
78125

79126
constructor(
80127
private readonly options: ClientOptions,
81128
private readonly telemetryService: ITelemetryService,
82129
private readonly metricsReporter: LocalSSHMetricsReporter,
83-
private readonly logService: ILogService
130+
private readonly logService: ILogService,
131+
private readonly flow: SSHUserFlowTelemetry
84132
) {
85-
this.flow = {
86-
flow: 'local_ssh',
87-
gitpodHost: this.options.host,
88-
workspaceId: '',
89-
};
90-
91133
this.onExit();
92134
this.onException();
93135
this.extensionIpc = createClient(ExtensionServiceDefinition, createChannel('127.0.0.1:' + this.options.extIpcPort));
94136
}
95137

96138
private onExit() {
97-
const exitHandler = (_signal?: NodeJS.Signals) => {
98-
process.exit(0);
139+
const exitHandler = (signal?: NodeJS.Signals) => {
140+
exitProcess(false, signal)
99141
};
100142
process.on('SIGINT', exitHandler);
101143
process.on('SIGTERM', exitHandler);
@@ -116,19 +158,21 @@ class WebSocketSSHProxy {
116158
// an error handler to the writable stream
117159
const sshStream = stream.Duplex.from({ readable: process.stdin, writable: process.stdout });
118160
sshStream.on('error', e => {
119-
if ((e as any).code === 'EPIPE') {
120-
// HACK:
121-
// Seems there's a bug in the ssh library that could hang forever when the stream gets closed
122-
// so the below `await pipePromise` will never return and the node process will never exit.
123-
// So let's just force kill here
124-
setTimeout(() => process.exit(0), 50);
161+
if ((e as any).code !== 'EPIPE') {
162+
// TODO filter out known error codes
163+
this.logService.error(e, 'unexpected sshStream error');
125164
}
165+
// HACK:
166+
// Seems there's a bug in the ssh library that could hang forever when the stream gets closed
167+
// so the below `await pipePromise` will never return and the node process will never exit.
168+
// So let's just force kill here
169+
setTimeout(() => exitProcess(true), 50);
126170
});
127171
// sshStream.on('end', () => {
128-
// setTimeout(() => process.exit(0), 50);
172+
// setTimeout(() => doProcessExit(0), 50);
129173
// });
130174
// sshStream.on('close', () => {
131-
// setTimeout(() => process.exit(0), 50);
175+
// setTimeout(() => doProcessExit(0), 50);
132176
// });
133177

134178
// This is expected to never throw as key is hardcoded
@@ -227,10 +271,46 @@ class WebSocketSSHProxy {
227271
'x-gitpod-owner-token': workspaceInfo.ownerToken
228272
}
229273
});
274+
230275
socket.binaryType = 'arraybuffer';
231276

232277
const stream = await new Promise<Stream>((resolve, reject) => {
233-
socket.onopen = () => resolve(new WebSocketStream(socket as any));
278+
socket.onopen = () => {
279+
// see https://github.com/gitpod-io/gitpod/blob/a5b4a66e0f384733145855f82f77332062e9d163/components/gitpod-protocol/go/websocket.go#L31-L40
280+
const pongPeriod = 15 * 1000;
281+
const pingPeriod = pongPeriod * 9 / 10;
282+
283+
let pingTimeout: NodeJS.Timeout | undefined;
284+
const heartbeat = () => {
285+
stopHearbeat();
286+
287+
// Use `WebSocket#terminate()`, which immediately destroys the connection,
288+
// instead of `WebSocket#close()`, which waits for the close timer.
289+
// Delay should be equal to the interval at which your server
290+
// sends out pings plus a conservative assumption of the latency.
291+
pingTimeout = setTimeout(() => {
292+
// TODO(ak) if we see stale socket.terminate();
293+
this.telemetryService.sendUserFlowStatus('stale', this.flow);
294+
}, pingPeriod + 1000);
295+
}
296+
function stopHearbeat() {
297+
if (pingTimeout != undefined) {
298+
clearTimeout(pingTimeout);
299+
pingTimeout = undefined;
300+
}
301+
}
302+
303+
socket.on('ping', heartbeat);
304+
305+
heartbeat();
306+
const socketWrapper = new WebSocketStream(socket as any);
307+
const wrappedOnClose = socket.onclose!;
308+
socket.onclose = (e) => {
309+
stopHearbeat();
310+
wrappedOnClose(e);
311+
}
312+
resolve(socketWrapper);
313+
}
234314
socket.onerror = (e) => reject(e);
235315
});
236316

@@ -281,30 +361,10 @@ class WebSocketSSHProxy {
281361
}
282362
}
283363

284-
const options = getClientOptions();
285-
if (!options) {
286-
process.exit(1);
287-
}
288-
289-
import { NopeLogger } from './logger';
290-
const logService = new NopeLogger();
291-
292-
// DO NOT PUSH CHANGES BELOW TO PRODUCTION
293-
// import { DebugLogger } from './logger';
294-
// const logService = new DebugLogger();
295-
296-
const telemetryService = new TelemetryService(
297-
process.env.SEGMENT_KEY!,
298-
options.machineID,
299-
process.env.EXT_NAME!,
300-
process.env.EXT_VERSION!,
301-
options.host,
302-
logService
303-
);
304-
305364
const metricsReporter = new LocalSSHMetricsReporter(logService);
306365

307-
const proxy = new WebSocketSSHProxy(options, telemetryService, metricsReporter, logService);
308-
proxy.start().catch(() => {
309-
// Noop, catch everything in start method pls
366+
const proxy = new WebSocketSSHProxy(options, telemetryService, metricsReporter, logService, flow);
367+
proxy.start().catch(e => {
368+
const err = new WrapError('Uncaught exception on start method', e);
369+
telemetryService.sendTelemetryException(err, { gitpodHost: options.host });
310370
});

src/local-ssh/telemetryService.ts

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,14 +31,14 @@ export class TelemetryService implements ITelemetryService {
3131
this.commonProperties = commonProperties;
3232
}
3333

34-
sendEventData(eventName: string, data?: Record<string, any>) {
34+
async sendEventData(eventName: string, data?: Record<string, any>): Promise<void> {
3535
const properties = mixin(cleanData(data ?? {}, this.cleanupPatterns, isTrustedValue), this.commonProperties);
3636

3737
if (!this.segmentClient) {
3838
return;
3939
}
4040

41-
commonSendEventData(this.logService, this.segmentClient, this.machineId, eventName, properties);
41+
return commonSendEventData(this.logService, this.segmentClient, this.machineId, eventName, properties);
4242
}
4343

4444
sendErrorData(error: Error, data?: Record<string, any>) {
@@ -53,20 +53,20 @@ export class TelemetryService implements ITelemetryService {
5353
// Noop, we disabled buffering
5454
}
5555

56-
sendTelemetryEvent(eventName: string, properties?: TelemetryEventProperties): void {
56+
sendTelemetryEvent(eventName: string, properties?: TelemetryEventProperties): Promise<void> {
5757
const props = properties ? Object.fromEntries(Object.entries(properties).map(([k, v]) => [k, TRUSTED_VALUES.has(k) ? new TelemetryTrustedValue(v) : v])) : undefined;
58-
this.sendEventData(eventName, props);
58+
return this.sendEventData(eventName, props);
5959
}
6060

6161
sendTelemetryException(error: Error, properties?: TelemetryEventProperties): void {
6262
const props = properties ? Object.fromEntries(Object.entries(properties).map(([k, v]) => [k, TRUSTED_VALUES.has(k) ? new TelemetryTrustedValue(v) : v])) : undefined;
6363
this.sendErrorData(error, props);
6464
}
6565

66-
sendUserFlowStatus(status: string, flowProperties: UserFlowTelemetryProperties): void {
66+
sendUserFlowStatus(status: string, flowProperties: UserFlowTelemetryProperties): Promise<void> {
6767
const properties: TelemetryEventProperties = { ...flowProperties, status };
6868
delete properties['flow'];
69-
this.sendTelemetryEvent('vscode_desktop_' + flowProperties.flow, properties);
69+
return this.sendTelemetryEvent('vscode_desktop_' + flowProperties.flow, properties);
7070
}
7171
}
7272

0 commit comments

Comments
 (0)