Skip to content

Commit 093e5a6

Browse files
abidhasan-awsgithub-actions
andauthored
fix(cli-integ): add retry for iam eventual consistency issue and migration tests for java (#788)
### Background The CDK pipelines have been experiencing intermittent failures due to flaky tests that typically pass on retry. This pull request addresses the investigation of the two most frequent failing tests. <img width="1307" height="530" alt="image (1)" src="https://github.com/user-attachments/assets/c03da25a-6921-4358-8a12-81db8722d437" /> ### AWS IAM Eventual Consistency Issue Test: `docker-credential-cdk-assets can assume role and fetch ECR credentials` Issue: Docker credential fetching fails with AccessDenied errors because newly created IAM roles and policies require time to propagate across AWS regions. Fix: Implemented a 60-second retry mechanism for `fetchDockerLoginCredentials()` when encountering AccessDenied errors. ### CDK Migration Test Instability Test: `cdk migrate java deploys successfully` Issue: Java CDK migration tests fail sporadically due to Maven Central repository rate limiting errors & dependency resolution failure Fix: Implemented full test retry logic as these transient network-related issues could not be reproduced in local environments. ### Impact These changes should improve pipeline stability and reduce the need for manual intervention. --- By submitting this pull request, I confirm that my contribution is made under the terms of the Apache-2.0 license --------- Signed-off-by: github-actions <[email protected]> Co-authored-by: github-actions <[email protected]>
1 parent c6585ad commit 093e5a6

File tree

3 files changed

+53
-33
lines changed

3 files changed

+53
-33
lines changed

packages/@aws-cdk-testing/cli-integ/lib/aws.ts

Lines changed: 38 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -276,28 +276,14 @@ export class AwsClients {
276276
}
277277

278278
public async waitForAssumeRole(roleArn: string) {
279-
// Wait until the role has replicated
280-
const deadline = Date.now() + 60_000;
281-
let lastError: Error | undefined;
282-
while (Date.now() < deadline) {
283-
try {
284-
await this.sts.send(new AssumeRoleCommand({
285-
RoleArn: roleArn,
286-
RoleSessionName: 'test-existence',
287-
}));
288-
return;
289-
} catch (e: any) {
290-
lastError = e;
291-
292-
if (e.name === 'AccessDenied') {
293-
continue;
294-
}
295-
296-
throw e;
297-
}
298-
}
299-
300-
throw new Error(`Timed out waiting for role ${roleArn} to become assumable: ${lastError}`);
279+
await retryOnMatchingErrors(
280+
() => this.sts.send(new AssumeRoleCommand({
281+
RoleArn: roleArn,
282+
RoleSessionName: 'test-existence',
283+
})),
284+
['AccessDenied'],
285+
retry.forSeconds(60),
286+
);
301287
}
302288

303289
public async deleteRole(name: string) {
@@ -381,6 +367,36 @@ export async function sleep(ms: number) {
381367
return new Promise((ok) => setTimeout(ok, ms));
382368
}
383369

370+
/**
371+
* Retry an async operation with error filtering until a deadline is hit.
372+
*
373+
* Use `retry.forSeconds()` to construct a deadline relative to right now.
374+
*
375+
* Only retries on errors with matching names in errorNames array.
376+
*/
377+
export async function retryOnMatchingErrors<T>(
378+
operation: () => Promise<T>,
379+
errorNames: string[],
380+
deadline: Date,
381+
interval: number = 5000,
382+
): Promise<T> {
383+
let i = 0;
384+
while (true) {
385+
try {
386+
i++;
387+
return await operation();
388+
} catch (e: any) {
389+
if (Date.now() > deadline.getTime()) {
390+
throw new Error(`Operation did not succeed after ${i} attempts: ${e}`);
391+
}
392+
if (!errorNames.includes(e.name)) {
393+
throw e;
394+
}
395+
await sleep(interval);
396+
}
397+
}
398+
}
399+
384400
function chainableCredentials(region: string): AwsCredentialIdentityProvider {
385401
if ((process.env.CODEBUILD_BUILD_ARN || process.env.GITHUB_RUN_ID) && process.env.AWS_PROFILE) {
386402
// in codebuild we must assume the role that the cdk uses

packages/@aws-cdk-testing/cli-integ/tests/cli-integ-tests/cdk-assets/cdk-assets-docker-credential.integtest.ts

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import { GetCallerIdentityCommand } from '@aws-sdk/client-sts';
55
// eslint-disable-next-line import/no-relative-packages
66
import type { DockerDomainCredentialSource } from '../../../../../@aws-cdk/cdk-assets-lib/lib/private/docker-credentials';
77
import type { TestFixture } from '../../../lib';
8-
import { integTest, withDefaultFixture, withRetry } from '../../../lib';
8+
import { integTest, withDefaultFixture, withRetry, retry } from '../../../lib';
99

1010
jest.setTimeout(2 * 60 * 60_000); // Includes the time to acquire locks, worst-case single-threaded runtime
1111

@@ -83,13 +83,17 @@ async function testDockerCredential(fixture: TestFixture, credSource: DockerDoma
8383
fs.writeFileSync(input, `${domain}\n`);
8484

8585
await fixture.cdkAssets.makeCliAvailable();
86-
const output = await fixture.shell(['docker-credential-cdk-assets', 'get'], {
87-
modEnv: {
88-
...fixture.cdkShellEnv(),
89-
CDK_DOCKER_CREDS_FILE: credsFilePath,
90-
},
91-
stdio: [fs.openSync(input, 'r')],
92-
captureStderr: false,
86+
let output: string = '';
87+
88+
await retry(process.stdout, 'Getting docker credentials', retry.forSeconds(60), async () => {
89+
output = await fixture.shell(['docker-credential-cdk-assets', 'get'], {
90+
modEnv: {
91+
...fixture.cdkShellEnv(),
92+
CDK_DOCKER_CREDS_FILE: credsFilePath,
93+
},
94+
stdio: [fs.openSync(input, 'r')],
95+
captureStderr: false,
96+
});
9397
});
9498

9599
const response = JSON.parse(output);
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
import { deploysSuccessfully } from './testcase';
2-
import { integTest, withCDKMigrateFixture } from '../../../lib';
2+
import { integTest, withCDKMigrateFixture, withRetry } from '../../../lib';
33

44
const language = 'java';
55

66
jest.setTimeout(2 * 60 * 60_000); // Includes the time to acquire locks, worst-case single-threaded runtime
77

88
integTest(
99
`cdk migrate ${language} deploys successfully`,
10-
withCDKMigrateFixture(language, async (fixture) => {
10+
withRetry(withCDKMigrateFixture(language, async (fixture) => {
1111
await deploysSuccessfully(fixture, language);
12-
}),
12+
})),
1313
);

0 commit comments

Comments
 (0)