Skip to content

Commit ae9fd49

Browse files
committed
feat: flush telemetry when process terminates
* DASH0_FLUSH_ON_SIGTERM_SIGINT (opt-in) * DASH0_FLUSH_ON_EMPTY_EVENT_LOOP (opt-out)
1 parent eac93d9 commit ae9fd49

File tree

6 files changed

+212
-4
lines changed

6 files changed

+212
-4
lines changed

README.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,32 @@ Disables the Dash0 Node.js distribution entirely.
4242

4343
By default, the instrumentation plug-in `@opentelemetry/instrumentation-fs` is disabled. Set `DASH0_ENABLE_FS_INSTRUMENTATION=true` to enable spans for file system access.
4444

45+
46+
### <a id="DASH0_FLUSH_ON_SIGTERM_SIGINT">DASH0_FLUSH_ON_SIGTERM_SIGINT</a>
47+
48+
If `DASH0_FLUSH_ON_SIGTERM_SIGINT=true` is set, the Dash0 Node.js distribution will install a handler for SIGTERM and
49+
SIGINT that will shut down the OpenTelemetry SDK gracefully when one of these signals is received.
50+
The SDK shutdown is timeboxed to 500 milliseconds.
51+
The signal handler will call `process.exit(0)` after the SDK's shutdown has completed, or after the 500-millisecond
52+
timeout, whichever happens sooner.
53+
This option can be helpful if you care about telemetry that is being produced shortly before the process terminates.
54+
This option must not be used if the application under monitoring has its own handler for SIGTERM or SIGINT, because
55+
Dash0's handler (and in particular the necessary `process.exit(0)` call) might interfere with the application's own
56+
signal handler.
57+
58+
### <a id="DASH0_FLUSH_ON_EMPTY_EVENT_LOOP">DASH0_FLUSH_ON_EMPTY_EVENT_LOOP</a>
59+
60+
By default, the Dash0 Node.js distribution will install a hook that will shut down the OpenTelemetry SDK gracefully when
61+
the Node.js runtime is about to exit because the event loop is empty.
62+
This can be disabled by setting `DASH0_FLUSH_ON_EMPTY_EVENT_LOOP=false`.
63+
The SDK shutdown is timeboxed to 500 milliseconds.
64+
This hook can be helpful if you care about telemetry that is being produced shortly before the process
65+
exits.
66+
Disabling it can be useful if you care about the process terminating as quickly as possible when the event loop is
67+
empty.
68+
In contrast to the handlers for SIGTERM/SIGINT (see above), this hook will not call `process.exit` (since the Node.js
69+
runtime will exit on its own anyway).
70+
4571
### <a id="DASH0_OTEL_COLLECTOR_BASE_URL">DASH0_OTEL_COLLECTOR_BASE_URL</a>
4672

4773
The base URL of the OpenTelemetry collector that the distribution will send data to.

src/init.ts

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ if (process.env.DASH0_DEBUG) {
2222
console.log('Dash0 OpenTelemetry distribution for Node.js: Starting NodeSDK.');
2323
}
2424

25+
let sdkShutdownHasBeenCalled = false;
26+
2527
let baseUrl = 'http://dash0-operator-opentelemetry-collector.dash0-operator-system.svc.cluster.local:4318';
2628
if (process.env.DASH0_OTEL_COLLECTOR_BASE_URL) {
2729
baseUrl = process.env.DASH0_OTEL_COLLECTOR_BASE_URL;
@@ -95,6 +97,69 @@ if (process.env.DASH0_BOOTSTRAP_SPAN != null) {
9597
.end();
9698
}
9799

100+
if (process.env.DASH0_FLUSH_ON_SIGTERM_SIGINT && process.env.DASH0_FLUSH_ON_SIGTERM_SIGINT.toLowerCase() === 'true') {
101+
['SIGTERM', 'SIGINT'].forEach(signal => {
102+
process.once(signal, onProcessExit.bind(null, true));
103+
});
104+
}
105+
106+
if (
107+
!process.env.DASH0_FLUSH_ON_EMPTY_EVENT_LOOP ||
108+
process.env.DASH0_FLUSH_ON_EMPTY_EVENT_LOOP.toLowerCase() !== 'false'
109+
) {
110+
process.once('beforeExit', onProcessExit.bind(null, false));
111+
}
112+
98113
if (process.env.DASH0_DEBUG) {
99114
console.log('Dash0 OpenTelemetry distribution for Node.js: NodeSDK started.');
100115
}
116+
117+
async function onProcessExit(callProcessExit: boolean) {
118+
await executePromiseWithTimeout(gracefulSdkShutdown(callProcessExit), 500, callProcessExit);
119+
}
120+
121+
async function gracefulSdkShutdown(callProcessExit: boolean) {
122+
try {
123+
if (sdkShutdownHasBeenCalled) {
124+
if (callProcessExit) {
125+
process.exit(0);
126+
}
127+
return;
128+
}
129+
130+
sdkShutdownHasBeenCalled = true;
131+
await sdk.shutdown();
132+
133+
if (process.env.DASH0_DEBUG) {
134+
console.log('Dash0 OpenTelemetry distribution for Node.js: OpenTelemetry SDK has been shut down successfully.');
135+
}
136+
} catch (err) {
137+
console.error('Dash0 OpenTelemetry distribution for Node.js: Error shutting down the OpenTelemetry SDK:', err);
138+
} finally {
139+
if (callProcessExit) {
140+
process.exit(0);
141+
}
142+
}
143+
}
144+
145+
function executePromiseWithTimeout(promise: Promise<any>, timeoutMillis: number, callProcessExit: boolean) {
146+
let setTimeoutId: NodeJS.Timeout;
147+
const timeoutPromise = new Promise(resolve => {
148+
setTimeoutId = setTimeout(() => {
149+
resolve(null);
150+
}, timeoutMillis);
151+
});
152+
153+
return Promise.race([
154+
//
155+
promise,
156+
timeoutPromise,
157+
]).finally(() => {
158+
if (setTimeoutId) {
159+
clearTimeout(setTimeoutId);
160+
}
161+
if (callProcessExit) {
162+
process.exit(0);
163+
}
164+
});
165+
}

test/apps/empty-event-loop/app.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
// SPDX-FileCopyrightText: Copyright 2024 Dash0 Inc.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
import { sendReadyToParentProcess } from '../../util/sendToParentProcess';
5+
6+
console.log('test application started');
7+
8+
sendReadyToParentProcess();
9+
10+
process.nextTick(() => {
11+
console.log('test application will terminate');
12+
});
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
{
2+
"name": "dash0-app-under-test-empty-event-loop",
3+
"version": "1.0.0",
4+
"description": "",
5+
"main": "app.ts",
6+
"scripts": {
7+
"test": "echo \"Error: no test specified\" && exit 1"
8+
},
9+
"author": "Bastian Krol <[email protected]>",
10+
"license": "Apache-2.0"
11+
}

test/integration/ChildProcessWrapper.ts

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ export default class ChildProcessWrapper {
118118
}, this.options.waitForReadyRetryOptions);
119119
}
120120

121-
async stop(): Promise<void> {
121+
async stop(signal?: number | NodeJS.Signals): Promise<void> {
122122
if (!this.childProcess) {
123123
return;
124124
}
@@ -129,7 +129,11 @@ export default class ChildProcessWrapper {
129129
this.childProcess = undefined;
130130
resolve();
131131
});
132-
this.childProcess.kill();
132+
if (signal) {
133+
this.childProcess.kill(signal);
134+
} else {
135+
this.childProcess.kill();
136+
}
133137
}
134138
});
135139
}
@@ -168,10 +172,9 @@ export function defaultAppConfiguration(appPort: number): ChildProcessWrapperOpt
168172
env: {
169173
...process.env,
170174
PORT: appPort.toString(),
171-
// have the Node.js SDK send spans every 100 ms instead of every 5 seconcds to speed up tests
175+
// have the Node.js SDK send spans every 100 ms instead of every 5 seconds to speed up tests
172176
OTEL_BSP_SCHEDULE_DELAY: '100',
173177
DASH0_OTEL_COLLECTOR_BASE_URL: 'http://localhost:4318',
174-
// OTEL_LOG_LEVEL: 'VERBOSE',
175178
},
176179
};
177180
}

test/integration/test.ts

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,97 @@ describe('attach', () => {
165165
});
166166
});
167167

168+
describe('flush on exit via signal', () => {
169+
let appUnderTest: ChildProcessWrapper;
170+
171+
beforeEach(async () => {
172+
const appConfiguration = defaultAppConfiguration(appPort);
173+
appConfiguration.env!.DASH0_BOOTSTRAP_SPAN = 'Dash0 Test Bootstrap Span';
174+
appConfiguration.env!.DASH0_FLUSH_ON_SIGTERM_SIGINT = 'true';
175+
176+
// Set the interval for sending spans to 5 minutes (300000 ms) instead of using the 100 ms the other test
177+
// cases use. This gives this test case a chance to fail without the flush-on-exit mechanism.
178+
appConfiguration.env!.OTEL_BSP_SCHEDULE_DELAY = '300000';
179+
appUnderTest = new ChildProcessWrapper(appConfiguration);
180+
});
181+
182+
afterEach(async () => {
183+
await appUnderTest.stop();
184+
});
185+
186+
it('should flush telemetry before process exit due to SIGTERM', async () => {
187+
await appUnderTest.start();
188+
await appUnderTest.stop();
189+
await waitUntil(async () => {
190+
const telemetry = await waitForTraceData();
191+
expectMatchingSpan(
192+
telemetry.traces,
193+
[
194+
resource => expectResourceAttribute(resource, 'telemetry.sdk.name', 'opentelemetry'),
195+
resource => expectResourceAttribute(resource, 'telemetry.sdk.language', 'nodejs'),
196+
resource => expectResourceAttribute(resource, 'telemetry.distro.name', 'dash0-nodejs'),
197+
resource => expectResourceAttribute(resource, 'telemetry.distro.version', expectedDistroVersion),
198+
],
199+
[span => expect(span.name).to.equal('Dash0 Test Bootstrap Span')],
200+
);
201+
});
202+
});
203+
204+
it('should flush telemetry before process exit due to SIGINT', async () => {
205+
await appUnderTest.start();
206+
await appUnderTest.stop('SIGINT');
207+
await waitUntil(async () => {
208+
const telemetry = await waitForTraceData();
209+
expectMatchingSpan(
210+
telemetry.traces,
211+
[
212+
resource => expectResourceAttribute(resource, 'telemetry.sdk.name', 'opentelemetry'),
213+
resource => expectResourceAttribute(resource, 'telemetry.sdk.language', 'nodejs'),
214+
resource => expectResourceAttribute(resource, 'telemetry.distro.name', 'dash0-nodejs'),
215+
resource => expectResourceAttribute(resource, 'telemetry.distro.version', expectedDistroVersion),
216+
],
217+
[span => expect(span.name).to.equal('Dash0 Test Bootstrap Span')],
218+
);
219+
});
220+
});
221+
});
222+
223+
describe('flush on normal process exit (empty event loop)', () => {
224+
let appUnderTest: ChildProcessWrapper;
225+
226+
beforeEach(async () => {
227+
const appConfiguration = {
228+
path: 'test/apps/empty-event-loop',
229+
label: 'app',
230+
useTsNode: true,
231+
useDistro: true,
232+
env: {
233+
...process.env,
234+
DASH0_OTEL_COLLECTOR_BASE_URL: 'http://localhost:4318',
235+
DASH0_BOOTSTRAP_SPAN: 'Dash0 Test Bootstrap Span',
236+
},
237+
};
238+
appUnderTest = new ChildProcessWrapper(appConfiguration);
239+
});
240+
241+
it('should flush telemetry before process exit due to empty event loop', async () => {
242+
await appUnderTest.start();
243+
await waitUntil(async () => {
244+
const telemetry = await waitForTraceData();
245+
expectMatchingSpan(
246+
telemetry.traces,
247+
[
248+
resource => expectResourceAttribute(resource, 'telemetry.sdk.name', 'opentelemetry'),
249+
resource => expectResourceAttribute(resource, 'telemetry.sdk.language', 'nodejs'),
250+
resource => expectResourceAttribute(resource, 'telemetry.distro.name', 'dash0-nodejs'),
251+
resource => expectResourceAttribute(resource, 'telemetry.distro.version', expectedDistroVersion),
252+
],
253+
[span => expect(span.name).to.equal('Dash0 Test Bootstrap Span')],
254+
);
255+
});
256+
});
257+
});
258+
168259
describe('print spans to file', () => {
169260
let appUnderTest: ChildProcessWrapper;
170261
const spanFilename = join(__dirname, 'spans.json');

0 commit comments

Comments
 (0)