Skip to content

Commit 1027479

Browse files
authored
Updating retry logic to use attempts for backoffs and fixing flaky te… (#342)
1 parent c9472a7 commit 1027479

File tree

8 files changed

+203
-295
lines changed

8 files changed

+203
-295
lines changed

assets/featureFlag/alpha.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,4 +46,4 @@
4646
]
4747
}
4848
}
49-
}
49+
}

assets/featureFlag/beta.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,4 +45,4 @@
4545
]
4646
}
4747
}
48-
}
48+
}

assets/featureFlag/prod.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,4 +41,4 @@
4141
]
4242
}
4343
}
44-
}
44+
}

src/stacks/actions/StackActionOperations.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,7 @@ export async function cleanupReviewStack(
274274
maxRetries: 3,
275275
initialDelayMs: 1000,
276276
operationName: `Delete stack ${workflow.stackName}`,
277+
totalTimeoutMs: 30_000,
277278
},
278279
logger,
279280
);
@@ -298,6 +299,7 @@ export async function deleteChangeSet(
298299
maxRetries: 3,
299300
initialDelayMs: 1000,
300301
operationName: `Delete change set ${workflow.changeSetName}`,
302+
totalTimeoutMs: 30_000,
301303
},
302304
logger,
303305
);

src/utils/Retry.ts

Lines changed: 53 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
import { Logger } from 'pino';
2+
import { extractErrorMessage } from './Errors';
23

34
export type RetryOptions = {
45
maxRetries?: number;
56
initialDelayMs?: number;
67
maxDelayMs?: number;
78
backoffMultiplier?: number;
89
jitterFactor?: number;
9-
operationName?: string;
10-
totalTimeoutMs?: number;
10+
operationName: string;
11+
totalTimeoutMs: number;
1112
};
1213

1314
function sleep(ms: number): Promise<void> {
@@ -16,65 +17,79 @@ function sleep(ms: number): Promise<void> {
1617
});
1718
}
1819

19-
function checkTotalTimeout(startTime: number, totalTimeoutMs: number | undefined, operationName: string): void {
20-
if (totalTimeoutMs !== undefined && Date.now() - startTime > totalTimeoutMs) {
21-
throw new Error(`${operationName} timed out after ${totalTimeoutMs}ms`);
22-
}
23-
}
24-
25-
function calculateNextDelay(
26-
currentDelay: number,
20+
function calculateDelay(
21+
attempt: number,
22+
initialDelayMs: number,
2723
jitterFactor: number,
2824
backoffMultiplier: number,
2925
maxDelayMs: number,
3026
): number {
31-
const nextDelay = currentDelay * backoffMultiplier;
32-
const jitter = jitterFactor > 0 ? Math.random() * jitterFactor * nextDelay : 0;
33-
return Math.min(nextDelay + jitter, maxDelayMs);
27+
// 1. Exponential Backoff: initial * 2^0, initial * 2^1, etc.
28+
const exponentialDelay = initialDelayMs * Math.pow(backoffMultiplier, attempt);
29+
30+
// 2. Cap at Max Delay
31+
const cappedDelay = Math.min(exponentialDelay, maxDelayMs);
32+
33+
// 3. Add Jitter (randomized percentage of the current delay), capped at maxDelayMs
34+
const jitter = jitterFactor > 0 ? Math.random() * jitterFactor * cappedDelay : 0;
35+
36+
return Math.min(cappedDelay + jitter, maxDelayMs);
3437
}
3538

36-
/**
37-
* Retry a function with exponential backoff
38-
*/
3939
export async function retryWithExponentialBackoff<T>(
4040
fn: () => Promise<T>,
4141
options: RetryOptions,
4242
log: Logger,
43+
sleepFn: (ms: number) => Promise<void> = (ms: number) => {
44+
return sleep(ms);
45+
},
4346
): Promise<T> {
44-
const maxRetries = options.maxRetries ?? 3;
45-
const initialDelayMs = options.initialDelayMs ?? 1000;
46-
const maxDelayMs = options.maxDelayMs ?? 30_000;
47-
const backoffMultiplier = options.backoffMultiplier ?? 2;
48-
const jitterFactor = options.jitterFactor ?? 0;
49-
const operationName = options.operationName ?? 'Operation';
50-
const totalTimeoutMs = options.totalTimeoutMs;
47+
const {
48+
maxRetries = 3,
49+
initialDelayMs = 1000,
50+
maxDelayMs = 5000,
51+
backoffMultiplier = 2,
52+
jitterFactor = 0.1,
53+
operationName,
54+
totalTimeoutMs,
55+
} = options;
5156

52-
let lastError: Error = new Error('No attempts made');
53-
let currentDelay = initialDelayMs;
54-
const startTime = Date.now();
57+
if (backoffMultiplier < 1) {
58+
throw new Error('Backoff multiplier must be greater than or equal to 1');
59+
}
60+
61+
if (totalTimeoutMs <= 0) {
62+
throw new Error('Total timeout must be greater than 0');
63+
}
5564

56-
for (let attempt = 0; attempt <= maxRetries; attempt++) {
57-
checkTotalTimeout(startTime, totalTimeoutMs, operationName);
65+
let lastError: Error | undefined = undefined;
66+
const startTime = performance.now();
67+
68+
// If maxRetries is 3, we run: 0 (initial), 1, 2, 3. Total 4 attempts.
69+
const attempts = maxRetries + 1;
70+
for (let attemptIdx = 0; attemptIdx < attempts; attemptIdx++) {
71+
if (attemptIdx > 0 && performance.now() - startTime >= totalTimeoutMs) {
72+
const message = `${operationName} timed out after ${performance.now() - startTime}ms, on attempt #${attemptIdx + 1}/${attempts}`;
73+
const errorMsg = lastError ? `${message}. Last error: ${lastError.message}` : message;
74+
throw new Error(errorMsg);
75+
}
5876

5977
try {
6078
return await fn();
6179
} catch (error) {
62-
lastError = error instanceof Error ? error : new Error(String(error));
63-
64-
if (attempt === maxRetries) {
65-
throw new Error(
66-
`${operationName} failed after ${maxRetries + 1} attempts. Last error: ${lastError.message}`,
67-
);
80+
lastError = error instanceof Error ? error : new Error(extractErrorMessage(error));
81+
if (attemptIdx === attempts - 1) {
82+
throw new Error(`${operationName} failed after ${attempts} attempts. Last error: ${lastError.message}`);
6883
}
6984

85+
const delay = calculateDelay(attemptIdx, initialDelayMs, jitterFactor, backoffMultiplier, maxDelayMs);
7086
log.warn(
71-
`${operationName} attempt ${attempt + 1} failed: ${lastError.message}. Retrying in ${currentDelay}ms...`,
87+
`${operationName} attempt ${attemptIdx + 1} failed: ${lastError.message}. Retrying in ${Math.round(delay)}ms...`,
7288
);
7389

74-
await sleep(currentDelay);
75-
currentDelay = calculateNextDelay(currentDelay, jitterFactor, backoffMultiplier, maxDelayMs);
90+
await sleepFn(delay);
7691
}
7792
}
7893

79-
throw lastError;
94+
throw new Error('Something went wrong, this is not reachable');
8095
}

tst/unit/services/cfnLint/PyodideWorkerManager.test.ts

Lines changed: 1 addition & 188 deletions
Original file line numberDiff line numberDiff line change
@@ -645,35 +645,6 @@ describe('PyodideWorkerManager', () => {
645645
});
646646

647647
describe('retry logic', () => {
648-
test('should not retry when maxRetries is 0', async () => {
649-
// Create a worker manager with retries disabled
650-
const noRetryWorkerManager = new PyodideWorkerManager(
651-
{
652-
maxRetries: 0,
653-
initialDelayMs: 10,
654-
maxDelayMs: 100,
655-
backoffMultiplier: 2,
656-
totalTimeoutMs: 5000,
657-
},
658-
createDefaultCfnLintSettings(),
659-
660-
mockLogging,
661-
);
662-
663-
// Mock worker creation to fail
664-
workerConstructor.mockImplementation(() => {
665-
throw new Error('Worker creation failed');
666-
});
667-
668-
// Expect initialization to fail immediately without retries
669-
await expect(noRetryWorkerManager.initialize()).rejects.toThrow(
670-
'Pyodide initialization failed after 1 attempts. Last error: Worker creation failed',
671-
);
672-
673-
// Verify no retry attempts were logged
674-
expect(mockLogging.warn.callCount).toBe(0); // No retry warnings
675-
});
676-
677648
test('should retry initialization on failure and eventually succeed', async () => {
678649
// Create a worker manager with retry enabled
679650
const retryWorkerManager = new PyodideWorkerManager(
@@ -954,9 +925,7 @@ describe('PyodideWorkerManager', () => {
954925
});
955926

956927
// Expect initialization to fail with timeout error
957-
await expect(retryWorkerManager.initialize()).rejects.toThrow(
958-
'Pyodide initialization timed out after 300ms',
959-
);
928+
await expect(retryWorkerManager.initialize()).rejects.toThrow(/Pyodide initialization timed out after 3/);
960929

961930
const totalTime = Date.now() - startTime;
962931

@@ -968,162 +937,6 @@ describe('PyodideWorkerManager', () => {
968937
expect(attemptCount).toBeGreaterThan(1);
969938
expect(attemptCount).toBeLessThan(11); // Should timeout before reaching max retries
970939
});
971-
972-
test('should work without totalTimeoutMs (backward compatibility)', async () => {
973-
// Create a worker manager without totalTimeoutMs (uses default from DefaultSettings)
974-
const retryWorkerManager = new PyodideWorkerManager(
975-
{
976-
maxRetries: 2,
977-
initialDelayMs: 10,
978-
maxDelayMs: 50,
979-
backoffMultiplier: 2,
980-
totalTimeoutMs: 120_000, // Uses default value from DefaultSettings
981-
},
982-
createDefaultCfnLintSettings(),
983-
984-
mockLogging,
985-
);
986-
987-
let attemptCount = 0;
988-
989-
workerConstructor.mockImplementation(() => {
990-
attemptCount++;
991-
throw new Error('Worker creation failed');
992-
});
993-
994-
// Should fail after max retries, not timeout (since timeout is very large)
995-
await expect(retryWorkerManager.initialize()).rejects.toThrow(
996-
'Pyodide initialization failed after 3 attempts. Last error: Worker creation failed',
997-
);
998-
999-
// Should have made all retry attempts (initial + 2 retries = 3 total)
1000-
expect(attemptCount).toBe(3);
1001-
});
1002-
1003-
test('should prefer explicit totalTimeoutMs over calculated timeout', async () => {
1004-
// Create a worker manager where explicit totalTimeoutMs is much shorter than calculated timeout
1005-
const retryWorkerManager = new PyodideWorkerManager(
1006-
{
1007-
maxRetries: 5, // Would normally allow 6 attempts
1008-
initialDelayMs: 100,
1009-
maxDelayMs: 1000, // Calculated timeout would be 1000 * 6 = 6000ms
1010-
backoffMultiplier: 2,
1011-
totalTimeoutMs: 200, // But explicit timeout is only 200ms
1012-
},
1013-
createDefaultCfnLintSettings(),
1014-
1015-
mockLogging,
1016-
);
1017-
1018-
const startTime = Date.now();
1019-
let attemptCount = 0;
1020-
1021-
workerConstructor.mockImplementation(() => {
1022-
attemptCount++;
1023-
throw new Error('Worker creation failed');
1024-
});
1025-
1026-
// Should timeout quickly due to explicit totalTimeoutMs
1027-
await expect(retryWorkerManager.initialize()).rejects.toThrow(
1028-
'Pyodide initialization timed out after 200ms',
1029-
);
1030-
1031-
const totalTime = Date.now() - startTime;
1032-
1033-
// Should respect the explicit 200ms timeout, not the calculated 6000ms
1034-
// Allow more variance for CI environments
1035-
expect(totalTime).toBeGreaterThanOrEqual(180);
1036-
expect(totalTime).toBeLessThan(350);
1037-
1038-
// Should have made fewer attempts due to quick timeout
1039-
expect(attemptCount).toBeLessThan(6); // Should not reach max retries
1040-
});
1041-
1042-
test('should handle zero totalTimeoutMs (immediate timeout)', async () => {
1043-
// Create a worker manager with zero totalTimeoutMs
1044-
const retryWorkerManager = new PyodideWorkerManager(
1045-
{
1046-
maxRetries: 3,
1047-
initialDelayMs: 100,
1048-
maxDelayMs: 500,
1049-
backoffMultiplier: 2,
1050-
totalTimeoutMs: 0, // Immediate timeout
1051-
},
1052-
createDefaultCfnLintSettings(),
1053-
1054-
mockLogging,
1055-
);
1056-
1057-
let attemptCount = 0;
1058-
1059-
workerConstructor.mockImplementation(() => {
1060-
attemptCount++;
1061-
throw new Error('Worker creation failed');
1062-
});
1063-
1064-
// Should timeout immediately after first attempt
1065-
await expect(retryWorkerManager.initialize()).rejects.toThrow('Pyodide initialization timed out after 0ms');
1066-
1067-
// Should have made at least one attempt before timing out
1068-
expect(attemptCount).toBe(1);
1069-
});
1070-
1071-
test('should handle very large totalTimeoutMs', async () => {
1072-
// Create a worker manager with very large totalTimeoutMs
1073-
const retryWorkerManager = new PyodideWorkerManager(
1074-
{
1075-
maxRetries: 2,
1076-
initialDelayMs: 10,
1077-
maxDelayMs: 50,
1078-
backoffMultiplier: 2,
1079-
totalTimeoutMs: 10000, // Very large timeout (10 seconds)
1080-
},
1081-
createDefaultCfnLintSettings(),
1082-
1083-
mockLogging,
1084-
);
1085-
1086-
let attemptCount = 0;
1087-
1088-
workerConstructor.mockImplementation(() => {
1089-
attemptCount++;
1090-
throw new Error('Worker creation failed');
1091-
});
1092-
1093-
// Should fail due to max retries, not timeout
1094-
await expect(retryWorkerManager.initialize()).rejects.toThrow(
1095-
'Pyodide initialization failed after 3 attempts. Last error: Worker creation failed',
1096-
);
1097-
1098-
// Should have made all retry attempts
1099-
expect(attemptCount).toBe(3);
1100-
});
1101-
1102-
test('should use default operation name in error messages', async () => {
1103-
// Create a worker manager without specifying operation name
1104-
const retryWorkerManager = new PyodideWorkerManager(
1105-
{
1106-
maxRetries: 1,
1107-
initialDelayMs: 10,
1108-
maxDelayMs: 100,
1109-
backoffMultiplier: 2,
1110-
totalTimeoutMs: 5000,
1111-
},
1112-
createDefaultCfnLintSettings(),
1113-
1114-
mockLogging,
1115-
);
1116-
1117-
// Always fail worker creation
1118-
workerConstructor.mockImplementation(() => {
1119-
throw new Error('Worker creation failed');
1120-
});
1121-
1122-
// Expect initialization to fail with default operation name
1123-
await expect(retryWorkerManager.initialize()).rejects.toThrow(
1124-
'Pyodide initialization failed after 2 attempts. Last error: Worker creation failed',
1125-
);
1126-
});
1127940
});
1128941

1129942
describe('race conditions and edge cases', () => {

tst/unit/stackActions/StackActionWorkflowOperations.test.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,7 @@ describe('StackActionWorkflowOperations', () => {
265265
maxRetries: 3,
266266
initialDelayMs: 1000,
267267
operationName: 'Delete stack test-stack',
268+
totalTimeoutMs: 30000,
268269
},
269270
expect.any(Object), // logger
270271
);
@@ -293,6 +294,7 @@ describe('StackActionWorkflowOperations', () => {
293294
maxRetries: 3,
294295
initialDelayMs: 1000,
295296
operationName: 'Delete change set changeset-123',
297+
totalTimeoutMs: 30000,
296298
},
297299
expect.any(Object), // logger
298300
);

0 commit comments

Comments
 (0)