Skip to content

Commit b57a3fb

Browse files
feat: add retry mechanism for flaky cli integ tests (#753)
We have observed frequent pipeline build failures due to flaky integration tests that usually succeed on retries. Currently, these tests have to be retried manually to unblock the pipelines. By enabling automated retries for these flaky tests, we can reduce intermittent pipeline failures and eliminate unnecessary manual intervention, which will then only be required for test failures that persist even after retries. The change adds a retry wrapper for specific CLI integration tests that have been identified as frequently failing but succeeding on retries. The goal is to identify the frequently failing flaky tests that require manual intervention and add a retry mechanism before we investigate the tests individually. Fixes: cdklabs/cdk-ops#4176 ### Top failing tests from last week <img width="1303" height="207" alt="image" src="https://github.com/user-attachments/assets/9911effe-a422-4327-a602-bce2b0c77121" /> --- By submitting this pull request, I confirm that my contribution is made under the terms of the Apache-2.0 license
1 parent c6e6c46 commit b57a3fb

File tree

2 files changed

+33
-3
lines changed

2 files changed

+33
-3
lines changed

packages/@aws-cdk-testing/cli-integ/lib/with-cdk-app.ts

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,36 @@ export function withCDKMigrateFixture(language: string, block: (content: TestFix
163163
return withAws(withTimeout(DEFAULT_TEST_TIMEOUT_S, withCdkMigrateApp(language, block)));
164164
}
165165

166+
/**
 * Retry wrapper that executes a test callback up to `maxAttempts` times.
 *
 * Attempts run sequentially and each one receives the same `context` object.
 * If any attempt succeeds, the returned function resolves immediately. If all
 * attempts fail, it rejects with the error thrown by the last attempt.
 * Every failed attempt except the final one is reported through `context.log`.
 *
 * @param callback - The test body to execute.
 * @param maxAttempts - Total number of attempts (not the number of retries);
 *   must be at least 1. Defaults to 2.
 * @returns A function with the same shape as `callback` that applies the
 *   retry behavior when invoked.
 * @throws RangeError (as a rejection of the returned function) when
 *   `maxAttempts` is less than 1 or NaN, since the callback would otherwise
 *   never run and the wrapper would reject with `undefined`.
 */
export function withRetry<T extends TestContext>(
  callback: (context: T) => Promise<void>,
  maxAttempts: number = 2,
): (context: T) => Promise<void> {
  return async (context: T) => {
    // Written as a negated comparison so that NaN is rejected as well.
    // This is exactly the set of values for which the loop below would not
    // execute even once.
    if (!(maxAttempts >= 1)) {
      throw new RangeError(`withRetry: maxAttempts must be at least 1, got ${maxAttempts}`);
    }

    let lastError: unknown;

    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
      try {
        await callback(context);
        return;
      } catch (error) {
        lastError = error;
        // The final failure is not logged here; it is surfaced by the rethrow below.
        if (attempt < maxAttempts) {
          context.log(`Attempt ${attempt}/${maxAttempts} failed: ${error}. Retrying...`);
        }
      }
    }

    throw lastError;
  };
}
195+
166196
export interface DisableBootstrapContext {
167197
/**
168198
* Whether to disable creating the default bootstrap

packages/@aws-cdk-testing/cli-integ/tests/cli-integ-tests/cdk-assets/cdk-assets-docker-credential.integtest.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@ import { GetCallerIdentityCommand } from '@aws-sdk/client-sts';
55
// eslint-disable-next-line import/no-relative-packages
66
import type { DockerDomainCredentialSource } from '../../../../../@aws-cdk/cdk-assets-lib/lib/private/docker-credentials';
77
import type { TestFixture } from '../../../lib';
8-
import { integTest, withDefaultFixture } from '../../../lib';
8+
import { integTest, withDefaultFixture, withRetry } from '../../../lib';
99

1010
jest.setTimeout(2 * 60 * 60_000); // Includes the time to acquire locks, worst-case single-threaded runtime
1111

1212
integTest(
1313
'docker-credential-cdk-assets can assume role and fetch ECR credentials',
14-
withDefaultFixture(async (fixture) => {
14+
withRetry(withDefaultFixture(async (fixture) => {
1515
const caller = await fixture.aws.sts.send(new GetCallerIdentityCommand({}));
1616

1717
const roleArn = await fixture.aws.temporaryRole('ecr-repo-role', [
@@ -35,7 +35,7 @@ integTest(
3535
// This role must have permissions to call `ecr:GetAuthorizationToken`
3636
assumeRoleArn: roleArn,
3737
});
38-
}),
38+
})),
3939
);
4040

4141
/*

0 commit comments

Comments
 (0)