Skip to content

Commit 23fbd0a

Browse files
authored
Use backoff strategy across projects and in between region attempts (#1715)
1 parent a9a08d4 commit 23fbd0a

File tree

7 files changed

+299
-16
lines changed

7 files changed

+299
-16
lines changed

.changeset/clean-forks-prove.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"livekit-client": patch
3+
---
4+
5+
Add exponential backoff strategy in case of connection failures

src/room/BackOffStrategy.test.ts

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
import { beforeEach, describe, expect, it, vi } from 'vitest';
2+
import { BackOffStrategy } from './BackOffStrategy';
3+
import * as utils from './utils';
4+
5+
// Replace only the pieces of ./utils that BackOffStrategy depends on:
//  - `sleep` becomes an immediately-resolving spy so tests never wait on real timers,
//  - `extractProjectFromUrl` keeps its real behavior but is wrapped in a spy so calls can be asserted.
vi.mock('./utils', async () => {
  // Typing the actual module lets the compiler check the delegation below,
  // instead of silencing every diagnostic on that line with a blanket @ts-ignore.
  const actual = await vi.importActual<typeof import('./utils')>('./utils');
  return {
    ...actual,
    // eslint-disable-next-line @typescript-eslint/no-unused-vars
    sleep: vi.fn((ms: number) => Promise.resolve()),
    extractProjectFromUrl: vi.fn((url: URL) => actual.extractProjectFromUrl(url)),
  };
});
17+
18+
// Unit tests for BackOffStrategy. `sleep` is mocked (see the vi.mock call in this file),
// so every assertion about a delay inspects the argument passed to `sleep`
// rather than waiting for real time to pass.
describe('BackOffStrategy', () => {
  // Typed handle on the mocked sleep, avoiding `as any` when reading recorded calls.
  const sleepMock = vi.mocked(utils.sleep);

  beforeEach(() => {
    // The singleton is private by design; tests reach in to drop it so that
    // no failure counts leak from one test into the next.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    (BackOffStrategy as any)._instance = null;
    vi.clearAllMocks();
  });

  it('should return the same singleton instance', () => {
    const instance1 = BackOffStrategy.getInstance();
    const instance2 = BackOffStrategy.getInstance();
    expect(instance1).toBe(instance2);
  });

  it('should not add failed attempts for self-hosted URLs', () => {
    const strategy = BackOffStrategy.getInstance();
    const selfHostedUrl = 'wss://my-server.com';

    strategy.addFailedConnectionAttempt(selfHostedUrl);

    // The URL was inspected for a project name...
    expect(utils.extractProjectFromUrl).toHaveBeenCalledWith(new URL(selfHostedUrl));
    // ...but none was found, so no backoff timer was started.
    expect(utils.sleep).not.toHaveBeenCalled();
  });

  it('should apply exponential backoff for cloud URLs', () => {
    const strategy = BackOffStrategy.getInstance();
    const cloudUrl = 'wss://myproject.livekit.cloud';

    // Each consecutive failure doubles the delay, starting at the 500ms base.
    // Asserting on the n-th call (not "any call") pins both the value and the order.
    [500, 1000, 2000, 4000].forEach((expectedDelay, index) => {
      strategy.addFailedConnectionAttempt(cloudUrl);
      expect(utils.sleep).toHaveBeenNthCalledWith(index + 1, expectedDelay);
    });
    expect(utils.sleep).toHaveBeenCalledTimes(4);
  });

  it('should cap backoff at maximum delay', () => {
    const strategy = BackOffStrategy.getInstance();
    const cloudUrl = 'wss://myproject.livekit.cloud';
    const maxBackoff = 15_000;

    // Simulate many failures to reach max backoff (500ms * 2^9 is far above the cap)
    for (let i = 0; i < 10; i++) {
      strategy.addFailedConnectionAttempt(cloudUrl);
    }

    expect(utils.sleep).toHaveBeenCalledTimes(10);
    // The last delay must be exactly the cap; a "<=" check would also accept no backoff at all.
    expect(utils.sleep).toHaveBeenLastCalledWith(maxBackoff);
    // No intermediate delay may exceed the cap either.
    for (const [delay] of sleepMock.mock.calls) {
      expect(delay).toBeLessThanOrEqual(maxBackoff);
    }
  });

  it('should reset failed attempts for a specific project', () => {
    const strategy = BackOffStrategy.getInstance();
    const cloudUrl = 'wss://myproject.livekit.cloud';

    // Add multiple failures
    strategy.addFailedConnectionAttempt(cloudUrl);
    strategy.addFailedConnectionAttempt(cloudUrl);
    strategy.addFailedConnectionAttempt(cloudUrl);

    // Reset the project
    strategy.resetFailedConnectionAttempts(cloudUrl);

    // Next failure should start from base delay again
    vi.clearAllMocks();
    strategy.addFailedConnectionAttempt(cloudUrl);
    expect(utils.sleep).toHaveBeenCalledTimes(1);
    expect(utils.sleep).toHaveBeenCalledWith(500);
  });

  it('should isolate backoff state between different projects', () => {
    const strategy = BackOffStrategy.getInstance();
    const project1Url = 'wss://project1.livekit.cloud';
    const project2Url = 'wss://project2.livekit.cloud';

    // Add failures to project1
    strategy.addFailedConnectionAttempt(project1Url);
    strategy.addFailedConnectionAttempt(project1Url);

    // Add failures to project2
    strategy.addFailedConnectionAttempt(project2Url);

    // Verify project2 starts at base delay despite project1 having multiple failures
    const delays = sleepMock.mock.calls.map(([delay]) => delay);
    expect(delays).toEqual([
      500, // project1 first failure
      1000, // project1 second failure
      500, // project2 first failure (independent)
    ]);
  });

  it('should return correct backoff promise', async () => {
    const strategy = BackOffStrategy.getInstance();
    const cloudUrl = 'wss://myproject.livekit.cloud';
    const selfHostedUrl = 'wss://my-server.com';

    // Add a failure to create a backoff promise
    strategy.addFailedConnectionAttempt(cloudUrl);
    const backoffPromise = strategy.getBackOffPromise(cloudUrl);

    // Should return a promise for cloud URLs with failures; with the mocked
    // sleep it settles immediately, so awaiting it must not hang.
    expect(backoffPromise).toBeInstanceOf(Promise);
    await expect(backoffPromise).resolves.toBeUndefined();

    // Should return resolved promise for self-hosted URLs
    const selfHostedPromise = strategy.getBackOffPromise(selfHostedUrl);
    expect(selfHostedPromise).toBeInstanceOf(Promise);
    await expect(selfHostedPromise).resolves.toBeUndefined();
  });

  it('should clear all state with resetAll', () => {
    const strategy = BackOffStrategy.getInstance();
    const project1Url = 'wss://project1.livekit.cloud';
    const project2Url = 'wss://project2.livekit.cloud';

    // Add failures to multiple projects
    strategy.addFailedConnectionAttempt(project1Url);
    strategy.addFailedConnectionAttempt(project1Url);
    strategy.addFailedConnectionAttempt(project2Url);

    // Reset all state
    strategy.resetAll();

    // Next failures should start from base delay for all projects
    vi.clearAllMocks();
    strategy.addFailedConnectionAttempt(project1Url);
    strategy.addFailedConnectionAttempt(project2Url);

    const delays = sleepMock.mock.calls.map(([delay]) => delay);
    expect(delays).toEqual([
      500, // project1 back to base
      500, // project2 back to base
    ]);
  });
});

src/room/BackOffStrategy.ts

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
import { extractProjectFromUrl, sleep } from './utils';
2+
3+
/** Delay applied after the first failed connection attempt for a project. */
const CONNECTION_BACKOFF_MIN_MS = 500;
/** Upper bound for the delay, regardless of how many failures have accumulated. */
const CONNECTION_BACKOFF_MAX_MS = 15_000;

/**
 * BackOffStrategy implements exponential backoff for connection failures.
 *
 * When severe connection failures occur (e.g., network issues, server unavailability),
 * this strategy introduces increasing delays between reconnection attempts to avoid
 * overwhelming the server and to give transient issues time to resolve.
 *
 * This strategy is only applied to LiveKit Cloud projects. It identifies
 * projects by extracting the project name from the connection URL and tracks failures
 * per project. Self-hosted deployments (URLs without a project identifier) are not
 * subject to backoff delays.
 *
 * The class is implemented as a singleton to maintain consistent backoff state across
 * the entire application lifecycle instead of room instance lifecycle.
 */
export class BackOffStrategy {
  private static _instance: BackOffStrategy | null = null;

  /** Number of consecutive failed attempts, keyed by project name. */
  private readonly failedConnectionAttempts = new Map<string, number>();

  /** The backoff timer started by the most recent failure, keyed by project name. */
  private readonly backOffPromises = new Map<string, Promise<void>>();

  // eslint-disable-next-line @typescript-eslint/no-empty-function
  private constructor() {}

  /** Returns the shared instance, creating it on first use. */
  static getInstance(): BackOffStrategy {
    if (!this._instance) {
      this._instance = new BackOffStrategy();
    }
    return this._instance;
  }

  /**
   * Resolves the key under which state for `urlString` is tracked.
   *
   * @returns the project name, or a falsy value when the URL carries none
   * @throws TypeError if `urlString` is not a valid URL (thrown by the `URL` constructor)
   */
  private static projectNameFor(urlString: string) {
    return extractProjectFromUrl(new URL(urlString));
  }

  /**
   * Records a failed connection attempt and starts the backoff timer for the
   * project behind `urlString`. The delay starts at CONNECTION_BACKOFF_MIN_MS,
   * doubles with every consecutive failure and is capped at CONNECTION_BACKOFF_MAX_MS.
   * URLs without a project name are ignored.
   *
   * @param urlString - the URL the connection attempt was made against
   * @throws TypeError if `urlString` is not a valid URL
   */
  addFailedConnectionAttempt(urlString: string): void {
    const projectName = BackOffStrategy.projectNameFor(urlString);
    if (!projectName) {
      return;
    }
    const previousFailures = this.failedConnectionAttempts.get(projectName) ?? 0;
    this.failedConnectionAttempts.set(projectName, previousFailures + 1);

    // The exponent is the number of failures *before* this one, so the first failure waits
    // the minimum. Math.min also absorbs 2 ** n overflowing to Infinity for very large n.
    const delayMs = Math.min(
      CONNECTION_BACKOFF_MIN_MS * 2 ** previousFailures,
      CONNECTION_BACKOFF_MAX_MS,
    );
    // The timer starts now (at failure time), not when the promise is later awaited.
    this.backOffPromises.set(projectName, sleep(delayMs));
  }

  /**
   * Returns the promise to await before the next connection attempt to `urlString`.
   *
   * @param urlString - the URL about to be connected to
   * @returns the timer started by the latest recorded failure for the URL's project,
   *   or an already-resolved promise when there is no project or no recorded failure
   * @throws TypeError if `urlString` is not a valid URL
   */
  getBackOffPromise(urlString: string): Promise<void> {
    const projectName = BackOffStrategy.projectNameFor(urlString);
    const pending = projectName ? this.backOffPromises.get(projectName) : undefined;
    return pending ?? Promise.resolve();
  }

  /**
   * Forgets all recorded failures for the project behind `urlString`, so the next
   * failure starts again from the minimum delay.
   *
   * @param urlString - a URL belonging to the project to reset
   * @throws TypeError if `urlString` is not a valid URL
   */
  resetFailedConnectionAttempts(urlString: string): void {
    const projectName = BackOffStrategy.projectNameFor(urlString);
    if (projectName) {
      // Delete rather than store 0 / a resolved promise: lookups already fall back to those
      // defaults, and the singleton would otherwise keep an entry per project forever.
      this.failedConnectionAttempts.delete(projectName);
      this.backOffPromises.delete(projectName);
    }
  }

  /** Clears the recorded failures and pending backoff timers of every project. */
  resetAll(): void {
    this.backOffPromises.clear();
    this.failedConnectionAttempts.clear();
  }
}

src/room/RegionUrlProvider.test.ts

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -714,6 +714,21 @@ describe('RegionUrlProvider', () => {
714714
await expect(provider.fetchRegionSettings()).rejects.toThrow('Network timeout');
715715
});
716716

717+
it('wraps fetch throw as ConnectionError with ServerUnreachable reason', async () => {
718+
const provider = new RegionUrlProvider('wss://test.livekit.cloud', 'token');
719+
720+
// Simulate fetch itself throwing an error (common in network failures)
721+
fetchMock.mockRejectedValue(new TypeError('Failed to fetch'));
722+
723+
// Should throw a ConnectionError that can be handled
724+
const error = await provider.fetchRegionSettings().catch((e) => e);
725+
726+
expect(error).toBeInstanceOf(ConnectionError);
727+
expect(error.reason).toBe(ConnectionErrorReason.ServerUnreachable);
728+
expect(error.status).toBe(500);
729+
expect(error.message).toContain('Failed to fetch');
730+
});
731+
717732
it('handles concurrent getNextBestRegionUrl calls', async () => {
718733
const provider = new RegionUrlProvider('wss://test.livekit.cloud', 'token');
719734
const mockSettings = createMockRegionSettings([

src/room/RegionUrlProvider.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,18 @@ export class RegionUrlProvider {
4545
regionSettingsResponse.status,
4646
);
4747
}
48+
} catch (e: unknown) {
49+
if (e instanceof ConnectionError) {
50+
// rethrow connection errors
51+
throw e;
52+
} else {
53+
// wrap other errors as connection errors (e.g. timeouts)
54+
throw new ConnectionError(
55+
`Could not fetch region settings, ${e instanceof Error ? `${e.name}: ${e.message}` : e}`,
56+
ConnectionErrorReason.ServerUnreachable,
57+
500, // using 500 as a catch-all manually set error code here
58+
);
59+
}
4860
} finally {
4961
unlock();
5062
}

src/room/Room.ts

Lines changed: 30 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ import type {
4343
RoomOptions,
4444
} from '../options';
4545
import { getBrowser } from '../utils/browserParser';
46+
import { BackOffStrategy } from './BackOffStrategy';
4647
import DeviceManager from './DeviceManager';
4748
import RTCEngine from './RTCEngine';
4849
import { RegionUrlProvider } from './RegionUrlProvider';
@@ -115,7 +116,7 @@ export enum ConnectionState {
115116
SignalReconnecting = 'signalReconnecting',
116117
}
117118

118-
const connectionReconcileFrequency = 4 * 1000;
119+
const CONNECTION_RECONCILE_FREQUENCY_MS = 4 * 1000;
119120

120121
/**
121122
* In LiveKit, a room is the logical grouping for a list of participants.
@@ -680,52 +681,64 @@ class Room extends (EventEmitter as new () => TypedEmitter<RoomEventCallbacks>)
680681
unlockDisconnect?.();
681682

682683
try {
684+
await BackOffStrategy.getInstance().getBackOffPromise(url);
683685
await this.attemptConnection(regionUrl ?? url, token, opts, abortController);
684686
this.abortController = undefined;
685687
resolve();
686-
} catch (e) {
688+
} catch (error) {
687689
if (
688690
this.regionUrlProvider &&
689-
e instanceof ConnectionError &&
690-
e.reason !== ConnectionErrorReason.Cancelled &&
691-
e.reason !== ConnectionErrorReason.NotAllowed
691+
error instanceof ConnectionError &&
692+
error.reason !== ConnectionErrorReason.Cancelled &&
693+
error.reason !== ConnectionErrorReason.NotAllowed
692694
) {
693695
let nextUrl: string | null = null;
694696
try {
695697
nextUrl = await this.regionUrlProvider.getNextBestRegionUrl(
696698
this.abortController?.signal,
697699
);
698-
} catch (error) {
700+
} catch (regionFetchError) {
699701
if (
700-
error instanceof ConnectionError &&
701-
(error.status === 401 || error.reason === ConnectionErrorReason.Cancelled)
702+
regionFetchError instanceof ConnectionError &&
703+
(regionFetchError.status === 401 ||
704+
regionFetchError.reason === ConnectionErrorReason.Cancelled)
702705
) {
703706
this.handleDisconnect(this.options.stopLocalTrackOnUnpublish);
704-
reject(error);
707+
reject(regionFetchError);
705708
return;
706709
}
707710
}
711+
if (
712+
// making sure we only register failed attempts on things we actually care about
713+
[
714+
ConnectionErrorReason.InternalError,
715+
ConnectionErrorReason.ServerUnreachable,
716+
ConnectionErrorReason.Timeout,
717+
].includes(error.reason)
718+
) {
719+
BackOffStrategy.getInstance().addFailedConnectionAttempt(url);
720+
}
708721
if (nextUrl && !this.abortController?.signal.aborted) {
709722
this.log.info(
710-
`Initial connection failed with ConnectionError: ${e.message}. Retrying with another region: ${nextUrl}`,
723+
`Initial connection failed with ConnectionError: ${error.message}. Retrying with another region: ${nextUrl}`,
711724
this.logContext,
712725
);
713726
this.recreateEngine();
714727
await connectFn(resolve, reject, nextUrl);
715728
} else {
716729
this.handleDisconnect(
717730
this.options.stopLocalTrackOnUnpublish,
718-
getDisconnectReasonFromConnectionError(e),
731+
getDisconnectReasonFromConnectionError(error),
719732
);
720-
reject(e);
733+
reject(error);
721734
}
722735
} else {
723736
let disconnectReason = DisconnectReason.UNKNOWN_REASON;
724-
if (e instanceof ConnectionError) {
725-
disconnectReason = getDisconnectReasonFromConnectionError(e);
737+
if (error instanceof ConnectionError) {
738+
disconnectReason = getDisconnectReasonFromConnectionError(error);
726739
}
727740
this.handleDisconnect(this.options.stopLocalTrackOnUnpublish, disconnectReason);
728-
reject(e);
741+
reject(error);
729742
}
730743
}
731744
};
@@ -919,6 +932,7 @@ class Room extends (EventEmitter as new () => TypedEmitter<RoomEventCallbacks>)
919932
}
920933
this.setAndEmitConnectionState(ConnectionState.Connected);
921934
this.emit(RoomEvent.Connected);
935+
BackOffStrategy.getInstance().resetFailedConnectionAttempts(url);
922936
this.registerConnectionReconcile();
923937
};
924938

@@ -2280,7 +2294,7 @@ class Room extends (EventEmitter as new () => TypedEmitter<RoomEventCallbacks>)
22802294
} else {
22812295
consecutiveFailures = 0;
22822296
}
2283-
}, connectionReconcileFrequency);
2297+
}, CONNECTION_RECONCILE_FREQUENCY_MS);
22842298
}
22852299

22862300
private clearConnectionReconcile() {

0 commit comments

Comments
 (0)