Skip to content

Commit 6a34b61

Browse files
Implement telemetry and global per-JVM limit for auto test retries (#7458)
1 parent bd05bf0 commit 6a34b61

File tree

11 files changed

+182
-8
lines changed

11 files changed

+182
-8
lines changed

dd-java-agent/agent-ci-visibility/src/main/java/datadog/trace/civisibility/domain/TestImpl.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import datadog.trace.api.civisibility.telemetry.tag.BrowserDriver;
1818
import datadog.trace.api.civisibility.telemetry.tag.EventType;
1919
import datadog.trace.api.civisibility.telemetry.tag.IsNew;
20+
import datadog.trace.api.civisibility.telemetry.tag.IsRetry;
2021
import datadog.trace.api.civisibility.telemetry.tag.IsRum;
2122
import datadog.trace.api.civisibility.telemetry.tag.TestFrameworkInstrumentation;
2223
import datadog.trace.api.gateway.RequestContextSlot;
@@ -245,6 +246,7 @@ public void end(@Nullable Long endTime) {
245246
instrumentation,
246247
EventType.TEST,
247248
span.getTag(Tags.TEST_IS_NEW) != null ? IsNew.TRUE : null,
249+
span.getTag(Tags.TEST_IS_RETRY) != null ? IsRetry.TRUE : null,
248250
span.getTag(Tags.TEST_IS_RUM_ACTIVE) != null ? IsRum.TRUE : null,
249251
CIConstants.SELENIUM_BROWSER_DRIVER.equals(span.getTag(Tags.TEST_BROWSER_DRIVER))
250252
? BrowserDriver.SELENIUM

dd-java-agent/agent-ci-visibility/src/main/java/datadog/trace/civisibility/domain/buildsystem/ProxyTestModule.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ public class ProxyTestModule implements TestFrameworkModule {
6666
private final Collection<TestIdentifier> knownTests;
6767
private final EarlyFlakeDetectionSettings earlyFlakeDetectionSettings;
6868
private final AtomicInteger earlyFlakeDetectionsUsed = new AtomicInteger(0);
69+
private final AtomicInteger autoRetriesUsed = new AtomicInteger(0);
6970
private final Collection<TestFramework> testFrameworks = ConcurrentHashMap.newKeySet();
7071

7172
public ProxyTestModule(
@@ -142,8 +143,9 @@ public TestRetryPolicy retryPolicy(TestIdentifier test) {
142143
return new RetryNTimes(earlyFlakeDetectionSettings);
143144
}
144145
if (flakyTestRetriesEnabled
145-
&& (flakyTests == null || flakyTests.contains(test.withoutParameters()))) {
146-
return new RetryIfFailed(config.getCiVisibilityFlakyRetryCount());
146+
&& (flakyTests == null || flakyTests.contains(test.withoutParameters()))
147+
&& autoRetriesUsed.get() < config.getCiVisibilityTotalFlakyRetryCount()) {
148+
return new RetryIfFailed(config.getCiVisibilityFlakyRetryCount(), autoRetriesUsed);
147149
}
148150
}
149151
return NeverRetry.INSTANCE;

dd-java-agent/agent-ci-visibility/src/main/java/datadog/trace/civisibility/domain/headless/HeadlessTestModule.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ public class HeadlessTestModule extends AbstractTestModule implements TestFramew
5151
private final Collection<TestIdentifier> knownTests;
5252
private final EarlyFlakeDetectionSettings earlyFlakeDetectionSettings;
5353
private final AtomicInteger earlyFlakeDetectionsUsed = new AtomicInteger(0);
54+
private final AtomicInteger autoRetriesUsed = new AtomicInteger(0);
5455
private final boolean codeCoverageEnabled;
5556
private final boolean testSkippingEnabled;
5657

@@ -131,8 +132,9 @@ public TestRetryPolicy retryPolicy(TestIdentifier test) {
131132
return new RetryNTimes(earlyFlakeDetectionSettings);
132133
}
133134
if (flakyTestRetriesEnabled
134-
&& (flakyTests == null || flakyTests.contains(test.withoutParameters()))) {
135-
return new RetryIfFailed(config.getCiVisibilityFlakyRetryCount());
135+
&& (flakyTests == null || flakyTests.contains(test.withoutParameters()))
136+
&& autoRetriesUsed.get() < config.getCiVisibilityTotalFlakyRetryCount()) {
137+
return new RetryIfFailed(config.getCiVisibilityFlakyRetryCount(), autoRetriesUsed);
136138
}
137139
}
138140
return NeverRetry.INSTANCE;

dd-java-agent/agent-ci-visibility/src/main/java/datadog/trace/civisibility/retry/RetryIfFailed.java

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,20 @@
11
package datadog.trace.civisibility.retry;
22

33
import datadog.trace.api.civisibility.retry.TestRetryPolicy;
4+
import java.util.concurrent.atomic.AtomicInteger;
45

56
/** Retries a test case if it failed, up to a maximum number of times. */
67
public class RetryIfFailed implements TestRetryPolicy {
78

89
private final int maxExecutions;
910
private int executions;
1011

11-
public RetryIfFailed(int maxExecutions) {
12+
/** Total number of retries granted so far (first attempts are not counted), shared by all retry policies */
13+
private final AtomicInteger totalExecutions;
14+
15+
public RetryIfFailed(int maxExecutions, AtomicInteger totalExecutions) {
1216
this.maxExecutions = maxExecutions;
17+
this.totalExecutions = totalExecutions;
1318
this.executions = 0;
1419
}
1520

@@ -25,7 +30,12 @@ public boolean suppressFailures() {
2530

2631
@Override
2732
public boolean retry(boolean successful, long duration) {
28-
return !successful && ++executions < maxExecutions;
33+
if (!successful && ++executions < maxExecutions) {
34+
totalExecutions.incrementAndGet();
35+
return true;
36+
} else {
37+
return false;
38+
}
2939
}
3040

3141
@Override

dd-java-agent/agent-ci-visibility/src/test/groovy/datadog/trace/civisibility/TestImplTest.groovy renamed to dd-java-agent/agent-ci-visibility/src/test/groovy/datadog/trace/civisibility/domain/TestImplTest.groovy

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package datadog.trace.civisibility
1+
package datadog.trace.civisibility.domain
22

33
import datadog.trace.agent.test.asserts.ListWriterAssert
44
import datadog.trace.agent.tooling.TracerInstaller
@@ -11,9 +11,9 @@ import datadog.trace.api.civisibility.coverage.CoverageStore
1111
import datadog.trace.api.civisibility.coverage.NoOpCoverageStore
1212
import datadog.trace.api.civisibility.telemetry.tag.TestFrameworkInstrumentation
1313
import datadog.trace.bootstrap.instrumentation.api.AgentTracer
14+
import datadog.trace.civisibility.InstrumentationType
1415
import datadog.trace.civisibility.codeowners.NoCodeowners
1516
import datadog.trace.civisibility.decorator.TestDecoratorImpl
16-
import datadog.trace.civisibility.domain.TestImpl
1717
import datadog.trace.civisibility.source.MethodLinesResolver
1818
import datadog.trace.civisibility.source.NoOpSourcePathResolver
1919
import datadog.trace.civisibility.telemetry.CiVisibilityMetricCollectorImpl
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
package datadog.trace.civisibility.domain.buildsystem
2+
3+
import datadog.trace.api.Config
4+
import datadog.trace.api.civisibility.config.EarlyFlakeDetectionSettings
5+
import datadog.trace.api.civisibility.config.ModuleExecutionSettings
6+
import datadog.trace.api.civisibility.config.TestIdentifier
7+
import datadog.trace.api.civisibility.coverage.CoverageDataSupplier
8+
import datadog.trace.api.civisibility.coverage.CoverageStore
9+
import datadog.trace.api.civisibility.telemetry.CiVisibilityMetricCollector
10+
import datadog.trace.civisibility.codeowners.Codeowners
11+
import datadog.trace.civisibility.decorator.TestDecorator
12+
import datadog.trace.civisibility.ipc.SignalClient
13+
import datadog.trace.civisibility.source.MethodLinesResolver
14+
import datadog.trace.civisibility.source.SourcePathResolver
15+
import datadog.trace.test.util.DDSpecification
16+
17+
class ProxyTestModuleTest extends DDSpecification {
18+
19+
def "test total retries limit is applied across test cases"() {
20+
def moduleExecutionSettings = Stub(ModuleExecutionSettings)
21+
moduleExecutionSettings.getEarlyFlakeDetectionSettings() >> EarlyFlakeDetectionSettings.DEFAULT
22+
moduleExecutionSettings.isFlakyTestRetriesEnabled() >> true
23+
moduleExecutionSettings.getFlakyTests(_) >> null
24+
25+
def config = Stub(Config)
26+
config.getCiVisibilityFlakyRetryCount() >> 2 // this counts all executions of a test case (first attempt is counted too)
27+
config.getCiVisibilityTotalFlakyRetryCount() >> 2 // this counts retries across all tests (first attempt is not a retry, so it is not counted)
28+
29+
given:
30+
def proxyTestModule = new ProxyTestModule(
31+
1L,
32+
1L,
33+
"test-module",
34+
moduleExecutionSettings,
35+
config,
36+
Stub(CiVisibilityMetricCollector),
37+
Stub(TestDecorator),
38+
Stub(SourcePathResolver),
39+
Stub(Codeowners),
40+
Stub(MethodLinesResolver),
41+
Stub(CoverageStore.Factory),
42+
Stub(CoverageDataSupplier),
43+
GroovyMock(SignalClient.Factory)
44+
)
45+
46+
when:
47+
def retryPolicy1 = proxyTestModule.retryPolicy(new TestIdentifier("suite", "test-1", null, null))
48+
49+
then:
50+
retryPolicy1.retry(false, 1L) // 2nd test execution, 1st retry globally
51+
!retryPolicy1.retry(false, 1L) // asking for 3rd test execution - local limit reached
52+
53+
when:
54+
def retryPolicy2 = proxyTestModule.retryPolicy(new TestIdentifier("suite", "test-2", null, null))
55+
56+
then:
57+
retryPolicy2.retry(false, 1L) // 2nd test execution, 2nd retry globally (since previous test was retried too)
58+
!retryPolicy2.retry(false, 1L) // asking for 3rd test execution - local limit reached
59+
60+
when:
61+
def retryPolicy3 = proxyTestModule.retryPolicy(new TestIdentifier("suite", "test-3", null, null))
62+
63+
then:
64+
!retryPolicy3.retry(false, 1L) // asking for 3rd retry globally - global limit reached
65+
}
66+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
package datadog.trace.civisibility.domain.headless
2+
3+
import datadog.trace.api.Config
4+
import datadog.trace.api.civisibility.config.EarlyFlakeDetectionSettings
5+
import datadog.trace.api.civisibility.config.ModuleExecutionSettings
6+
import datadog.trace.api.civisibility.config.TestIdentifier
7+
import datadog.trace.api.civisibility.coverage.CoverageStore
8+
import datadog.trace.api.civisibility.telemetry.CiVisibilityMetricCollector
9+
import datadog.trace.bootstrap.instrumentation.api.AgentSpan
10+
import datadog.trace.civisibility.codeowners.Codeowners
11+
import datadog.trace.civisibility.decorator.TestDecorator
12+
import datadog.trace.civisibility.source.MethodLinesResolver
13+
import datadog.trace.civisibility.source.SourcePathResolver
14+
import datadog.trace.test.util.DDSpecification
15+
16+
class HeadlessTestModuleTest extends DDSpecification {
17+
18+
def "test total retries limit is applied across test cases"() {
19+
def moduleExecutionSettings = Stub(ModuleExecutionSettings)
20+
moduleExecutionSettings.getEarlyFlakeDetectionSettings() >> EarlyFlakeDetectionSettings.DEFAULT
21+
moduleExecutionSettings.isFlakyTestRetriesEnabled() >> true
22+
moduleExecutionSettings.getFlakyTests(_) >> null
23+
24+
def config = Stub(Config)
25+
config.getCiVisibilityFlakyRetryCount() >> 2 // this counts all executions of a test case (first attempt is counted too)
26+
config.getCiVisibilityTotalFlakyRetryCount() >> 2 // this counts retries across all tests (first attempt is not a retry, so it is not counted)
27+
28+
given:
29+
def headlessTestModule = new HeadlessTestModule(
30+
Stub(AgentSpan.Context),
31+
1L,
32+
"test-module",
33+
null,
34+
config,
35+
Stub(CiVisibilityMetricCollector),
36+
Stub(TestDecorator),
37+
Stub(SourcePathResolver),
38+
Stub(Codeowners),
39+
Stub(MethodLinesResolver),
40+
Stub(CoverageStore.Factory),
41+
moduleExecutionSettings,
42+
(span) -> {}
43+
)
44+
45+
when:
46+
def retryPolicy1 = headlessTestModule.retryPolicy(new TestIdentifier("suite", "test-1", null, null))
47+
48+
then:
49+
retryPolicy1.retry(false, 1L) // 2nd test execution, 1st retry globally
50+
!retryPolicy1.retry(false, 1L) // asking for 3rd test execution - local limit reached
51+
52+
when:
53+
def retryPolicy2 = headlessTestModule.retryPolicy(new TestIdentifier("suite", "test-2", null, null))
54+
55+
then:
56+
retryPolicy2.retry(false, 1L) // 2nd test execution, 2nd retry globally (since previous test was retried too)
57+
!retryPolicy2.retry(false, 1L) // asking for 3rd test execution - local limit reached
58+
59+
when:
60+
def retryPolicy3 = headlessTestModule.retryPolicy(new TestIdentifier("suite", "test-3", null, null))
61+
62+
then:
63+
!retryPolicy3.retry(false, 1L) // asking for 3rd retry globally - global limit reached
64+
}
65+
66+
67+
}

dd-trace-api/src/main/java/datadog/trace/api/config/CiVisibilityConfig.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ public final class CiVisibilityConfig {
6060
public static final String CIVISIBILITY_EARLY_FLAKE_DETECTION_LOWER_LIMIT =
6161
"civisibility.early.flake.detection.lower.limit";
6262
public static final String CIVISIBILITY_FLAKY_RETRY_COUNT = "civisibility.flaky.retry.count";
63+
public static final String CIVISIBILITY_TOTAL_FLAKY_RETRY_COUNT =
64+
"civisibility.total.flaky.retry.count";
6365
public static final String CIVISIBILITY_MODULE_NAME = "civisibility.module.name";
6466
public static final String CIVISIBILITY_TELEMETRY_ENABLED = "civisibility.telemetry.enabled";
6567
public static final String CIVISIBILITY_RUM_FLUSH_WAIT_MILLIS =

internal-api/src/main/java/datadog/trace/api/Config.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,7 @@
212212
import static datadog.trace.api.config.CiVisibilityConfig.CIVISIBILITY_SOURCE_DATA_ENABLED;
213213
import static datadog.trace.api.config.CiVisibilityConfig.CIVISIBILITY_TELEMETRY_ENABLED;
214214
import static datadog.trace.api.config.CiVisibilityConfig.CIVISIBILITY_TEST_SKIPPING_ENABLED;
215+
import static datadog.trace.api.config.CiVisibilityConfig.CIVISIBILITY_TOTAL_FLAKY_RETRY_COUNT;
215216
import static datadog.trace.api.config.CiVisibilityConfig.CIVISIBILITY_TRACE_SANITATION_ENABLED;
216217
import static datadog.trace.api.config.CrashTrackingConfig.CRASH_TRACKING_AGENTLESS;
217218
import static datadog.trace.api.config.CrashTrackingConfig.CRASH_TRACKING_AGENTLESS_DEFAULT;
@@ -836,6 +837,7 @@ public static String getHostName() {
836837
private final boolean ciVisibilityFlakyRetryEnabled;
837838
private final boolean ciVisibilityFlakyRetryOnlyKnownFlakes;
838839
private final int ciVisibilityFlakyRetryCount;
840+
private final int ciVisibilityTotalFlakyRetryCount;
839841
private final boolean ciVisibilityEarlyFlakeDetectionEnabled;
840842
private final int ciVisibilityEarlyFlakeDetectionLowerLimit;
841843
private final String ciVisibilityModuleName;
@@ -1898,6 +1900,8 @@ PROFILING_DATADOG_PROFILER_ENABLED, isDatadogProfilerSafeInCurrentEnvironment())
18981900
ciVisibilityEarlyFlakeDetectionLowerLimit =
18991901
configProvider.getInteger(CIVISIBILITY_EARLY_FLAKE_DETECTION_LOWER_LIMIT, 30);
19001902
ciVisibilityFlakyRetryCount = configProvider.getInteger(CIVISIBILITY_FLAKY_RETRY_COUNT, 5);
1903+
ciVisibilityTotalFlakyRetryCount =
1904+
configProvider.getInteger(CIVISIBILITY_TOTAL_FLAKY_RETRY_COUNT, 1000);
19011905
ciVisibilityModuleName = configProvider.getString(CIVISIBILITY_MODULE_NAME);
19021906
ciVisibilityTelemetryEnabled = configProvider.getBoolean(CIVISIBILITY_TELEMETRY_ENABLED, true);
19031907
ciVisibilityRumFlushWaitMillis =
@@ -3239,6 +3243,10 @@ public int getCiVisibilityFlakyRetryCount() {
32393243
return ciVisibilityFlakyRetryCount;
32403244
}
32413245

3246+
public int getCiVisibilityTotalFlakyRetryCount() {
3247+
return ciVisibilityTotalFlakyRetryCount;
3248+
}
3249+
32423250
public String getCiVisibilityModuleName() {
32433251
return ciVisibilityModuleName;
32443252
}

internal-api/src/main/java/datadog/trace/api/civisibility/telemetry/CiVisibilityCountMetric.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import datadog.trace.api.civisibility.telemetry.tag.IsBenchmark;
1616
import datadog.trace.api.civisibility.telemetry.tag.IsHeadless;
1717
import datadog.trace.api.civisibility.telemetry.tag.IsNew;
18+
import datadog.trace.api.civisibility.telemetry.tag.IsRetry;
1819
import datadog.trace.api.civisibility.telemetry.tag.IsRum;
1920
import datadog.trace.api.civisibility.telemetry.tag.IsUnsupportedCI;
2021
import datadog.trace.api.civisibility.telemetry.tag.ItrEnabled;
@@ -53,6 +54,7 @@ public enum CiVisibilityCountMetric {
5354
IsBenchmark.class,
5455
EarlyFlakeDetectionAbortReason.class,
5556
IsNew.class,
57+
IsRetry.class,
5658
IsRum.class,
5759
BrowserDriver.class),
5860
/** The number of successfully collected code coverages that are empty */

0 commit comments

Comments
 (0)