Skip to content

Commit 4722eeb

Browse files
author
Jakub Sokół
committed
fix(otlp-exporter-base): ensure retry on network errors during HTTP export
OpenTelemetry OTLP/HTTP specification states: ``` If the server disconnects without returning a response, the client SHOULD retry and send the same request. The client SHOULD implement an exponential backoff strategy between retries to avoid overwhelming the server. ... If the client cannot connect to the server, the client SHOULD retry the connection using an exponential backoff strategy between retries. The interval between retries must have a random jitter. ``` The backoff infrastrucure was in place, it was just the glue code to request APIs (fetch, http, xhr) that was reporting non-retryable state in case of errors that might be temporary.
1 parent c071e6e commit 4722eeb

File tree

8 files changed

+284
-48
lines changed

8 files changed

+284
-48
lines changed

experimental/packages/otlp-exporter-base/src/is-export-retryable.ts

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,63 @@
1414
* limitations under the License.
1515
*/
1616

17-
export function isExportRetryable(statusCode: number): boolean {
17+
export function isExportHTTPErrorRetryable(statusCode: number): boolean {
1818
const retryCodes = [429, 502, 503, 504];
1919
return retryCodes.includes(statusCode);
2020
}
2121

22+
function getErrorCode(error: unknown): string | undefined {
23+
if (!error || typeof error !== 'object') {
24+
return undefined;
25+
}
26+
27+
if ('code' in error && typeof error.code === 'string') {
28+
return error.code;
29+
}
30+
31+
const err = error as Error;
32+
if (err.cause && typeof err.cause === 'object' && 'code' in err.cause) {
33+
const code = (err.cause as { code: unknown }).code;
34+
if (typeof code === 'string') {
35+
return code;
36+
}
37+
}
38+
39+
return undefined;
40+
}
41+
42+
export function isExportNetworkErrorRetryable(error: Error): boolean {
43+
const RETRYABLE_ERROR_CODES = new Set([
44+
'ECONNRESET',
45+
'ECONNREFUSED',
46+
'EPIPE',
47+
'ETIMEDOUT',
48+
'EAI_AGAIN',
49+
'ENOTFOUND',
50+
'ENETUNREACH',
51+
'EHOSTUNREACH',
52+
'UND_ERR_CONNECT_TIMEOUT',
53+
'UND_ERR_HEADERS_TIMEOUT',
54+
'UND_ERR_BODY_TIMEOUT',
55+
'UND_ERR_SOCKET',
56+
]);
57+
58+
if (error.name === 'AbortError') {
59+
return false;
60+
}
61+
62+
const code = getErrorCode(error);
63+
if (code && RETRYABLE_ERROR_CODES.has(code)) {
64+
return true;
65+
}
66+
67+
if (error instanceof TypeError && !error.cause) {
68+
return true;
69+
}
70+
71+
return false;
72+
}
73+
2274
export function parseRetryAfterToMills(
2375
retryAfter?: string | undefined | null
2476
): number | undefined {

experimental/packages/otlp-exporter-base/src/transport/fetch-transport.ts

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@ import { IExporterTransport } from '../exporter-transport';
1818
import { ExportResponse } from '../export-response';
1919
import { diag } from '@opentelemetry/api';
2020
import {
21-
isExportRetryable,
21+
isExportHTTPErrorRetryable,
22+
isExportNetworkErrorRetryable,
2223
parseRetryAfterToMills,
2324
} from '../is-export-retryable';
2425
import { HeadersFactory } from '../configuration/otlp-http-configuration';
@@ -53,7 +54,7 @@ class FetchTransport implements IExporterTransport {
5354
if (response.status >= 200 && response.status <= 299) {
5455
diag.debug('response success');
5556
return { status: 'success' };
56-
} else if (isExportRetryable(response.status)) {
57+
} else if (isExportHTTPErrorRetryable(response.status)) {
5758
const retryAfter = response.headers.get('Retry-After');
5859
const retryInMillis = parseRetryAfterToMills(retryAfter);
5960
return { status: 'retryable', retryInMillis };
@@ -63,10 +64,10 @@ class FetchTransport implements IExporterTransport {
6364
error: new Error('Fetch request failed with non-retryable status'),
6465
};
6566
} catch (error) {
66-
if (error?.name === 'AbortError') {
67+
if (isExportNetworkErrorRetryable(error)) {
6768
return {
68-
status: 'failure',
69-
error: new Error('Fetch request timed out', { cause: error }),
69+
status: 'retryable',
70+
retryInMillis: 0,
7071
};
7172
}
7273
return {

experimental/packages/otlp-exporter-base/src/transport/http-transport-utils.ts

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ import * as zlib from 'zlib';
1919
import { Readable } from 'stream';
2020
import { ExportResponse } from '../export-response';
2121
import {
22-
isExportRetryable,
22+
isExportHTTPErrorRetryable,
23+
isExportNetworkErrorRetryable,
2324
parseRetryAfterToMills,
2425
} from '../is-export-retryable';
2526
import { OTLPExporterError } from '../types';
@@ -74,7 +75,7 @@ export function sendWithHttp(
7475
status: 'success',
7576
data: Buffer.concat(responseData),
7677
});
77-
} else if (res.statusCode && isExportRetryable(res.statusCode)) {
78+
} else if (res.statusCode && isExportHTTPErrorRetryable(res.statusCode)) {
7879
onDone({
7980
status: 'retryable',
8081
retryInMillis: parseRetryAfterToMills(res.headers['retry-after']),
@@ -96,16 +97,23 @@ export function sendWithHttp(
9697
req.setTimeout(timeoutMillis, () => {
9798
req.destroy();
9899
onDone({
99-
status: 'failure',
100-
error: new Error('Request Timeout'),
100+
status: 'retryable',
101+
retryInMillis: 0,
101102
});
102103
});
103104

104105
req.on('error', (error: Error) => {
105-
onDone({
106-
status: 'failure',
107-
error,
108-
});
106+
if (isExportNetworkErrorRetryable(error)) {
107+
onDone({
108+
status: 'retryable',
109+
retryInMillis: 0,
110+
});
111+
} else {
112+
onDone({
113+
status: 'failure',
114+
error,
115+
});
116+
}
109117
});
110118

111119
compressAndSend(req, compression, data, (error: Error) => {

experimental/packages/otlp-exporter-base/src/transport/xhr-transport.ts

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ import { IExporterTransport } from '../exporter-transport';
1818
import { ExportResponse } from '../export-response';
1919
import { diag } from '@opentelemetry/api';
2020
import {
21-
isExportRetryable,
21+
isExportHTTPErrorRetryable,
2222
parseRetryAfterToMills,
2323
} from '../is-export-retryable';
2424
// eslint-disable-next-line @typescript-eslint/no-unused-vars
@@ -49,8 +49,8 @@ class XhrTransport implements IExporterTransport {
4949

5050
xhr.ontimeout = _ => {
5151
resolve({
52-
status: 'failure',
53-
error: new Error('XHR request timed out'),
52+
status: 'retryable',
53+
retryInMillis: 0,
5454
});
5555
};
5656

@@ -60,7 +60,7 @@ class XhrTransport implements IExporterTransport {
6060
resolve({
6161
status: 'success',
6262
});
63-
} else if (xhr.status && isExportRetryable(xhr.status)) {
63+
} else if (xhr.status && isExportHTTPErrorRetryable(xhr.status)) {
6464
resolve({
6565
status: 'retryable',
6666
retryInMillis: parseRetryAfterToMills(
@@ -82,9 +82,10 @@ class XhrTransport implements IExporterTransport {
8282
});
8383
};
8484
xhr.onerror = () => {
85+
// XHR onerror typically indicates network failures which are retryable
8586
resolve({
86-
status: 'failure',
87-
error: new Error('XHR request errored'),
87+
status: 'retryable',
88+
retryInMillis: 0,
8889
});
8990
};
9091

experimental/packages/otlp-exporter-base/test/browser/fetch-transport.test.ts

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ describe('FetchTransport', function () {
122122
}, done /* catch any rejections */);
123123
});
124124

125-
it('returns failure when request times out', function (done) {
125+
it('returns failure when request is aborted', function (done) {
126126
// arrange
127127
const abortError = new Error('aborted request');
128128
abortError.name = 'AbortError';
@@ -137,7 +137,7 @@ describe('FetchTransport', function () {
137137
assert.strictEqual(response.status, 'failure');
138138
assert.strictEqual(
139139
(response as ExportResponseFailure).error.message,
140-
'Fetch request timed out'
140+
'Fetch request errored'
141141
);
142142
} catch (e) {
143143
done(e);
@@ -147,7 +147,7 @@ describe('FetchTransport', function () {
147147
clock.tick(requestTimeout + 100);
148148
});
149149

150-
it('returns failure when no server exists', function (done) {
150+
it('returns failure when fetch throws non-network error', function (done) {
151151
// arrange
152152
sinon.stub(globalThis, 'fetch').throws(new Error('fetch failed'));
153153
const clock = sinon.useFakeTimers();
@@ -169,5 +169,51 @@ describe('FetchTransport', function () {
169169
}, done /* catch any rejections */);
170170
clock.tick(requestTimeout + 100);
171171
});
172+
173+
it('returns retryable when browser fetch throws network error', function (done) {
174+
// arrange
175+
// Browser fetch throws TypeError for network errors
176+
sinon.stub(globalThis, 'fetch').rejects(new TypeError('Failed to fetch'));
177+
const transport = createFetchTransport(testTransportParameters);
178+
179+
//act
180+
transport.send(testPayload, requestTimeout).then(response => {
181+
// assert
182+
try {
183+
assert.strictEqual(response.status, 'retryable');
184+
assert.strictEqual(
185+
(response as ExportResponseRetryable).retryInMillis,
186+
0
187+
);
188+
} catch (e) {
189+
done(e);
190+
}
191+
done();
192+
}, done /* catch any rejections */);
193+
});
194+
195+
it('returns retryable when fetch throws network error with code', function (done) {
196+
// arrange
197+
const cause = new Error('network error') as NodeJS.ErrnoException;
198+
cause.code = 'ECONNRESET';
199+
const networkError = new TypeError('fetch failed', { cause });
200+
sinon.stub(globalThis, 'fetch').rejects(networkError);
201+
const transport = createFetchTransport(testTransportParameters);
202+
203+
//act
204+
transport.send(testPayload, requestTimeout).then(response => {
205+
// assert
206+
try {
207+
assert.strictEqual(response.status, 'retryable');
208+
assert.strictEqual(
209+
(response as ExportResponseRetryable).retryInMillis,
210+
0
211+
);
212+
} catch (e) {
213+
done(e);
214+
}
215+
done();
216+
}, done /* catch any rejections */);
217+
});
172218
});
173219
});

experimental/packages/otlp-exporter-base/test/browser/xhr-transport.test.ts

Lines changed: 46 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ import * as assert from 'assert';
1919
import { createXhrTransport } from '../../src/transport/xhr-transport';
2020
import {
2121
ExportResponseRetryable,
22-
ExportResponseFailure,
2322
ExportResponseSuccess,
23+
ExportResponseFailure,
2424
} from '../../src';
2525
import { ensureHeadersContain } from '../testHelper';
2626

@@ -58,6 +58,14 @@ function hasOnTimeout(request: unknown): request is { ontimeout: () => void } {
5858
return 'ontimeout' in request && typeof request['ontimeout'] === 'function';
5959
}
6060

61+
function hasOnAbort(request: unknown): request is { onabort: () => void } {
62+
if (request == null || typeof request != 'object') {
63+
return false;
64+
}
65+
66+
return 'onabort' in request && typeof request['onabort'] === 'function';
67+
}
68+
6169
describe('XhrTransport', function () {
6270
afterEach(function () {
6371
sinon.restore();
@@ -151,7 +159,7 @@ describe('XhrTransport', function () {
151159
}, done /* catch any rejections */);
152160
});
153161

154-
it('returns failure when request times out', function (done) {
162+
it('returns retryable when request times out', function (done) {
155163
// arrange
156164
// A fake server needed, otherwise the message will not be a timeout but a failure to connect.
157165
const server = sinon.useFakeServer();
@@ -170,10 +178,10 @@ describe('XhrTransport', function () {
170178
transport.send(testPayload, requestTimeout).then(response => {
171179
// assert
172180
try {
173-
assert.strictEqual(response.status, 'failure');
181+
assert.strictEqual(response.status, 'retryable');
174182
assert.strictEqual(
175-
(response as ExportResponseFailure).error.message,
176-
'XHR request timed out'
183+
(response as ExportResponseRetryable).retryInMillis,
184+
0
177185
);
178186
} catch (e) {
179187
done(e);
@@ -182,26 +190,56 @@ describe('XhrTransport', function () {
182190
}, done /* catch any rejections */);
183191
});
184192

185-
it('returns failure when no server exists', function (done) {
193+
it('returns retryable when network error occurs', function (done) {
186194
// arrange
187195
const clock = sinon.useFakeTimers();
188196
const transport = createXhrTransport(testTransportParameters);
189197

198+
//act
199+
transport.send(testPayload, requestTimeout).then(response => {
200+
// assert
201+
try {
202+
assert.strictEqual(response.status, 'retryable');
203+
assert.strictEqual(
204+
(response as ExportResponseRetryable).retryInMillis,
205+
0
206+
);
207+
} catch (e) {
208+
done(e);
209+
}
210+
done();
211+
}, done /* catch any rejections */);
212+
clock.tick(requestTimeout + 100);
213+
});
214+
215+
it('returns failure when request is aborted', function (done) {
216+
// arrange
217+
const server = sinon.useFakeServer();
218+
const transport = createXhrTransport(testTransportParameters);
219+
220+
respondWhenRequestExists(server, () => {
221+
// this executes after the act block
222+
const request = server.requests[0];
223+
// Manually trigger the onabort event
224+
if (hasOnAbort(request)) {
225+
request.onabort();
226+
}
227+
});
228+
190229
//act
191230
transport.send(testPayload, requestTimeout).then(response => {
192231
// assert
193232
try {
194233
assert.strictEqual(response.status, 'failure');
195234
assert.strictEqual(
196235
(response as ExportResponseFailure).error.message,
197-
'XHR request errored'
236+
'XHR request aborted'
198237
);
199238
} catch (e) {
200239
done(e);
201240
}
202241
done();
203242
}, done /* catch any rejections */);
204-
clock.tick(requestTimeout + 100);
205243
});
206244
});
207245
});

0 commit comments

Comments
 (0)