Skip to content

Commit 893bd2d

Browse files
authored
Add retry policy for uploading requests to agent (#7824)
Add a mechanism to retry uploads on network failure or on a retryable response code (5xx, 408, or 429). The retry policy stores each call made through OkHttp to track the number of failures so far, along with the maximum number of retries allowed. No retries are allowed for snapshots; up to 10 are allowed for probe statuses and symdb.
1 parent 48a5921 commit 893bd2d

File tree

10 files changed

+205
-36
lines changed

10 files changed

+205
-36
lines changed

dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/agent/DebuggerAgent.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,10 @@ private static DebuggerSink createDebuggerSink(Config config, ProbeStatusSink pr
167167
String tags = getDefaultTagsMergedWithGlobalTags(config);
168168
SnapshotSink snapshotSink =
169169
new SnapshotSink(
170-
config, tags, new BatchUploader(config, config.getFinalDebuggerSnapshotUrl()));
170+
config,
171+
tags,
172+
new BatchUploader(
173+
config, config.getFinalDebuggerSnapshotUrl(), SnapshotSink.RETRY_POLICY));
171174
SymbolSink symbolSink = new SymbolSink(config);
172175
return new DebuggerSink(
173176
config,

dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/agent/DebuggerTransformer.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,10 @@ public DebuggerTransformer(Config config, Configuration configuration) {
152152
DebuggerMetrics.getInstance(config),
153153
new ProbeStatusSink(config, config.getFinalDebuggerSnapshotUrl(), false),
154154
new SnapshotSink(
155-
config, "", new BatchUploader(config, config.getFinalDebuggerSnapshotUrl())),
155+
config,
156+
"",
157+
new BatchUploader(
158+
config, config.getFinalDebuggerSnapshotUrl(), SnapshotSink.RETRY_POLICY)),
156159
new SymbolSink(config)));
157160
}
158161

dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/sink/DebuggerSink.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,10 @@ public DebuggerSink(Config config, ProbeStatusSink probeStatusSink) {
4343
DebuggerMetrics.getInstance(config),
4444
probeStatusSink,
4545
new SnapshotSink(
46-
config, null, new BatchUploader(config, config.getFinalDebuggerSnapshotUrl())),
46+
config,
47+
null,
48+
new BatchUploader(
49+
config, config.getFinalDebuggerSnapshotUrl(), SnapshotSink.RETRY_POLICY)),
4750
new SymbolSink(config));
4851
}
4952

dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/sink/ProbeStatusSink.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ public class ProbeStatusSink {
3333
private static final JsonAdapter<ProbeStatus> PROBE_STATUS_ADAPTER =
3434
MoshiHelper.createMoshiProbeStatus().adapter(ProbeStatus.class);
3535
private static final int MINUTES_BETWEEN_ERROR_LOG = 5;
36+
public static final BatchUploader.RetryPolicy RETRY_POLICY = new BatchUploader.RetryPolicy(10);
3637

3738
private final BatchUploader diagnosticUploader;
3839
private final Builder messageBuilder;
@@ -46,7 +47,7 @@ public class ProbeStatusSink {
4647
private final boolean useMultiPart;
4748

4849
public ProbeStatusSink(Config config, String diagnosticsEndpoint, boolean useMultiPart) {
49-
this(config, new BatchUploader(config, diagnosticsEndpoint), useMultiPart);
50+
this(config, new BatchUploader(config, diagnosticsEndpoint, RETRY_POLICY), useMultiPart);
5051
}
5152

5253
ProbeStatusSink(Config config, BatchUploader diagnosticUploader, boolean useMultiPart) {

dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/sink/SnapshotSink.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ public class SnapshotSink {
3030
private static final int HIGH_RATE_25_PERCENT_CAPACITY = HIGH_RATE_CAPACITY / 4;
3131
private static final int HIGH_RATE_75_PERCENT_CAPACITY = HIGH_RATE_CAPACITY * 3 / 4;
3232
static final long HIGH_RATE_STEP_SIZE = 10;
33+
public static final BatchUploader.RetryPolicy RETRY_POLICY = new BatchUploader.RetryPolicy(0);
3334

3435
private final BlockingQueue<Snapshot> lowRateSnapshots =
3536
new ArrayBlockingQueue<>(LOW_RATE_CAPACITY);

dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/sink/SymbolSink.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ public class SymbolSink {
2020

2121
private static final Logger LOGGER = LoggerFactory.getLogger(SymbolSink.class);
2222
static final int CAPACITY = 1024;
23+
public static final BatchUploader.RetryPolicy RETRY_POLICY = new BatchUploader.RetryPolicy(10);
2324
private static final JsonAdapter<ServiceVersion> SERVICE_VERSION_ADAPTER =
2425
MoshiHelper.createMoshiSymbol().adapter(ServiceVersion.class);
2526
private static final String EVENT_FORMAT =
@@ -38,7 +39,7 @@ public class SymbolSink {
3839
private final Stats stats = new Stats();
3940

4041
public SymbolSink(Config config) {
41-
this(config, new BatchUploader(config, config.getFinalDebuggerSymDBUrl()));
42+
this(config, new BatchUploader(config, config.getFinalDebuggerSymDBUrl(), RETRY_POLICY));
4243
}
4344

4445
SymbolSink(Config config, BatchUploader symbolUploader) {

dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/uploader/BatchUploader.java

Lines changed: 92 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
import datadog.trace.util.AgentThreadFactory;
1111
import java.io.IOException;
1212
import java.time.Duration;
13+
import java.util.concurrent.ConcurrentHashMap;
14+
import java.util.concurrent.ConcurrentMap;
1315
import java.util.concurrent.ExecutorService;
1416
import java.util.concurrent.Phaser;
1517
import java.util.concurrent.SynchronousQueue;
@@ -56,7 +58,16 @@ public String getFileName() {
5658
}
5759
}
5860

59-
private static final Logger log = LoggerFactory.getLogger(BatchUploader.class);
61+
public static class RetryPolicy {
62+
public final ConcurrentMap<Call, Integer> failures = new ConcurrentHashMap<>();
63+
public final int maxFailures;
64+
65+
public RetryPolicy(int maxFailures) {
66+
this.maxFailures = maxFailures;
67+
}
68+
}
69+
70+
private static final Logger LOGGER = LoggerFactory.getLogger(BatchUploader.class);
6071
private static final int MINUTES_BETWEEN_ERROR_LOG = 5;
6172
private static final MediaType APPLICATION_JSON = MediaType.parse("application/json");
6273
private static final String HEADER_DD_CONTAINER_ID = "Datadog-Container-ID";
@@ -76,18 +87,28 @@ public String getFileName() {
7687
private final DebuggerMetrics debuggerMetrics;
7788
private final boolean instrumentTheWorld;
7889
private final RatelimitedLogger ratelimitedLogger;
90+
private final RetryPolicy retryPolicy;
7991

8092
private final Phaser inflightRequests = new Phaser(1);
8193

82-
public BatchUploader(Config config, String endpoint) {
83-
this(config, endpoint, new RatelimitedLogger(log, MINUTES_BETWEEN_ERROR_LOG, TimeUnit.MINUTES));
94+
public BatchUploader(Config config, String endpoint, RetryPolicy retryPolicy) {
95+
this(
96+
config,
97+
endpoint,
98+
new RatelimitedLogger(LOGGER, MINUTES_BETWEEN_ERROR_LOG, TimeUnit.MINUTES),
99+
retryPolicy);
84100
}
85101

86-
BatchUploader(Config config, String endpoint, RatelimitedLogger ratelimitedLogger) {
102+
BatchUploader(
103+
Config config,
104+
String endpoint,
105+
RatelimitedLogger ratelimitedLogger,
106+
RetryPolicy retryPolicy) {
87107
this(
88108
config,
89109
endpoint,
90110
ratelimitedLogger,
111+
retryPolicy,
91112
ContainerInfo.get().containerId,
92113
ContainerInfo.getEntityId());
93114
}
@@ -97,17 +118,17 @@ public BatchUploader(Config config, String endpoint) {
97118
Config config,
98119
String endpoint,
99120
RatelimitedLogger ratelimitedLogger,
121+
RetryPolicy retryPolicy,
100122
String containerId,
101123
String entityId) {
102124
instrumentTheWorld = config.isDebuggerInstrumentTheWorld();
103125
if (endpoint == null || endpoint.length() == 0) {
104126
throw new IllegalArgumentException("Endpoint url is empty");
105127
}
106128
urlBase = HttpUrl.get(endpoint);
107-
log.debug("Started BatchUploader with target url {}", urlBase);
129+
LOGGER.debug("Started BatchUploader with target url {}", urlBase);
108130
apiKey = config.getApiKey();
109131
this.ratelimitedLogger = ratelimitedLogger;
110-
responseCallback = new ResponseCallback(ratelimitedLogger, inflightRequests);
111132
// This is the same thing OkHttp Dispatcher is doing except thread naming and daemonization
112133
okHttpExecutorService =
113134
new ThreadPoolExecutor(
@@ -117,6 +138,7 @@ public BatchUploader(Config config, String endpoint) {
117138
TimeUnit.SECONDS,
118139
new SynchronousQueue<>(),
119140
new AgentThreadFactory(DEBUGGER_HTTP_DISPATCHER));
141+
this.retryPolicy = retryPolicy;
120142
this.containerId = containerId;
121143
this.entityId = entityId;
122144
Duration requestTimeout = Duration.ofSeconds(config.getDebuggerUploadTimeout());
@@ -132,6 +154,8 @@ public BatchUploader(Config config, String endpoint) {
132154
null, /* proxyUsername */
133155
null, /* proxyPassword */
134156
requestTimeout.toMillis());
157+
responseCallback =
158+
new ResponseCallback(ratelimitedLogger, inflightRequests, client, retryPolicy);
135159
debuggerMetrics = DebuggerMetrics.getInstance(config);
136160
}
137161

@@ -195,6 +219,10 @@ public HttpUrl getUrl() {
195219
return urlBase;
196220
}
197221

222+
RetryPolicy getRetryPolicy() {
223+
return retryPolicy;
224+
}
225+
198226
private void makeUploadRequest(byte[] json, String tags) {
199227
int contentLength = json.length;
200228
// use RequestBody.create(MediaType, byte[]) to avoid changing Content-Type to
@@ -205,8 +233,8 @@ private void makeUploadRequest(byte[] json, String tags) {
205233

206234
private void buildAndSendRequest(RequestBody body, int contentLength, String tags) {
207235
debuggerMetrics.histogram("batch.uploader.request.size", contentLength);
208-
if (log.isDebugEnabled()) {
209-
log.debug("Uploading batch data size={} bytes", contentLength);
236+
if (LOGGER.isDebugEnabled()) {
237+
LOGGER.debug("Uploading batch data size={} bytes", contentLength);
210238
}
211239
HttpUrl.Builder builder = urlBase.newBuilder();
212240
if (tags != null && !tags.isEmpty()) {
@@ -215,17 +243,17 @@ private void buildAndSendRequest(RequestBody body, int contentLength, String tag
215243
Request.Builder requestBuilder = new Request.Builder().url(builder.build()).post(body);
216244
if (apiKey != null) {
217245
if (apiKey.isEmpty()) {
218-
log.debug("API key is empty");
246+
LOGGER.debug("API key is empty");
219247
}
220248
if (apiKey.length() != 32) {
221-
log.debug(
249+
LOGGER.debug(
222250
"API key length is incorrect (truncated?) expected=32 actual={} API key={}...",
223251
apiKey.length(),
224252
apiKey.substring(0, Math.min(apiKey.length(), 6)));
225253
}
226254
requestBuilder.addHeader(HEADER_DD_API_KEY, apiKey);
227255
} else {
228-
log.debug("API key is null");
256+
LOGGER.debug("API key is null");
229257
}
230258
if (containerId != null) {
231259
requestBuilder.addHeader(HEADER_DD_CONTAINER_ID, containerId);
@@ -234,24 +262,23 @@ private void buildAndSendRequest(RequestBody body, int contentLength, String tag
234262
requestBuilder.addHeader(HEADER_DD_ENTITY_ID, entityId);
235263
}
236264
Request request = requestBuilder.build();
237-
log.debug("Sending request: {} CT: {}", request, request.body().contentType());
238-
client.newCall(request).enqueue(responseCallback);
239-
inflightRequests.register();
265+
LOGGER.debug("Sending request: {} CT: {}", request, request.body().contentType());
266+
enqueueCall(client, request, responseCallback, retryPolicy, 0, inflightRequests);
240267
}
241268

242269
public void shutdown() {
243270
try {
244271
inflightRequests.awaitAdvanceInterruptibly(inflightRequests.arrive(), 10, TimeUnit.SECONDS);
245272
} catch (TimeoutException | InterruptedException ignored) {
246-
log.warn("Not all upload requests have been handled");
273+
LOGGER.warn("Not all upload requests have been handled");
247274
}
248275
okHttpExecutorService.shutdownNow();
249276
try {
250277
okHttpExecutorService.awaitTermination(TERMINATION_TIMEOUT, TimeUnit.SECONDS);
251278
} catch (final InterruptedException e) {
252279
// Note: this should only happen in main thread right before exiting, so eating up interrupted
253280
// state should be fine.
254-
log.warn("Wait for executor shutdown interrupted");
281+
LOGGER.warn("Wait for executor shutdown interrupted");
255282
}
256283
client.connectionPool().evictAll();
257284
}
@@ -260,28 +287,68 @@ private boolean canEnqueueMoreRequests() {
260287
return client.dispatcher().queuedCallsCount() < MAX_ENQUEUED_REQUESTS;
261288
}
262289

290+
private static void enqueueCall(
291+
OkHttpClient client,
292+
Request request,
293+
Callback responseCallback,
294+
RetryPolicy retryPolicy,
295+
int failureCount,
296+
Phaser inflightRequests) {
297+
Call call = client.newCall(request);
298+
retryPolicy.failures.put(call, failureCount);
299+
call.enqueue(responseCallback);
300+
inflightRequests.register();
301+
}
302+
263303
private static final class ResponseCallback implements Callback {
264304

265305
private final RatelimitedLogger ratelimitedLogger;
266306
private final Phaser inflightRequests;
307+
private final OkHttpClient client;
308+
private final RetryPolicy retryPolicy;
267309

268-
public ResponseCallback(final RatelimitedLogger ratelimitedLogger, Phaser inflightRequests) {
310+
public ResponseCallback(
311+
final RatelimitedLogger ratelimitedLogger,
312+
Phaser inflightRequests,
313+
OkHttpClient client,
314+
RetryPolicy retryPolicy) {
269315
this.ratelimitedLogger = ratelimitedLogger;
270316
this.inflightRequests = inflightRequests;
317+
this.client = client;
318+
this.retryPolicy = retryPolicy;
271319
}
272320

273321
@Override
274-
public void onFailure(final Call call, final IOException e) {
322+
public void onFailure(Call call, IOException e) {
275323
inflightRequests.arriveAndDeregister();
276324
ratelimitedLogger.warn("Failed to upload batch to {}", call.request().url(), e);
325+
handleRetry(call, retryPolicy.maxFailures);
326+
}
327+
328+
private void handleRetry(Call call, int maxFailures) {
329+
Integer failure = retryPolicy.failures.remove(call);
330+
if (failure != null) {
331+
int failureCount = failure + 1;
332+
if (failureCount <= maxFailures) {
333+
LOGGER.debug(
334+
"Retrying upload to {}, {}/{}", call.request().url(), failureCount, maxFailures);
335+
enqueueCall(client, call.request(), this, retryPolicy, failureCount, inflightRequests);
336+
} else {
337+
LOGGER.warn(
338+
"Failed permanently to upload batch to {} after {} attempts",
339+
call.request().url(),
340+
maxFailures);
341+
}
342+
}
277343
}
278344

279345
@Override
280-
public void onResponse(final Call call, final Response response) {
346+
public void onResponse(Call call, Response response) {
281347
try {
282348
inflightRequests.arriveAndDeregister();
283349
if (response.isSuccessful()) {
284-
log.debug("Upload done");
350+
LOGGER.debug("Upload done");
351+
retryPolicy.failures.remove(call);
285352
} else {
286353
ResponseBody body = response.body();
287354
// Retrieve body content for detailed error messages
@@ -301,6 +368,11 @@ public void onResponse(final Call call, final Response response) {
301368
response.message(),
302369
response.code());
303370
}
371+
if (response.code() >= 500 || response.code() == 408 || response.code() == 429) {
372+
handleRetry(call, retryPolicy.maxFailures);
373+
} else {
374+
retryPolicy.failures.remove(call);
375+
}
304376
}
305377
} finally {
306378
response.close();

dd-java-agent/agent-debugger/src/test/java/com/datadog/debugger/sink/DebuggerSinkTest.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -483,7 +483,10 @@ public void skipSnapshot() {
483483
DebuggerMetrics debuggerMetrics = spy(DebuggerMetrics.getInstance(config));
484484
SnapshotSink snapshotSink =
485485
new SnapshotSink(
486-
config, "", new BatchUploader(config, config.getFinalDebuggerSnapshotUrl()));
486+
config,
487+
"",
488+
new BatchUploader(
489+
config, config.getFinalDebuggerSnapshotUrl(), SnapshotSink.RETRY_POLICY));
487490
SymbolSink symbolSink = new SymbolSink(config);
488491
DebuggerSink sink =
489492
new DebuggerSink(config, "", debuggerMetrics, probeStatusSink, snapshotSink, symbolSink);

dd-java-agent/agent-debugger/src/test/java/com/datadog/debugger/sink/SymbolSinkTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ static class SymbolUploaderMock extends BatchUploader {
8585
final List<MultiPartContent> multiPartContents = new ArrayList<>();
8686

8787
public SymbolUploaderMock() {
88-
super(Config.get(), "http://localhost");
88+
super(Config.get(), "http://localhost", SymbolSink.RETRY_POLICY);
8989
}
9090

9191
@Override

0 commit comments

Comments
 (0)