1010import datadog .trace .util .AgentThreadFactory ;
1111import java .io .IOException ;
1212import java .time .Duration ;
13+ import java .util .concurrent .ConcurrentHashMap ;
14+ import java .util .concurrent .ConcurrentMap ;
1315import java .util .concurrent .ExecutorService ;
1416import java .util .concurrent .Phaser ;
1517import java .util .concurrent .SynchronousQueue ;
@@ -56,7 +58,16 @@ public String getFileName() {
5658 }
5759 }
5860
59- private static final Logger log = LoggerFactory .getLogger (BatchUploader .class );
61+ public static class RetryPolicy {
62+ public final ConcurrentMap <Call , Integer > failures = new ConcurrentHashMap <>();
63+ public final int maxFailures ;
64+
65+ public RetryPolicy (int maxFailures ) {
66+ this .maxFailures = maxFailures ;
67+ }
68+ }
69+
70+ private static final Logger LOGGER = LoggerFactory .getLogger (BatchUploader .class );
6071 private static final int MINUTES_BETWEEN_ERROR_LOG = 5 ;
6172 private static final MediaType APPLICATION_JSON = MediaType .parse ("application/json" );
6273 private static final String HEADER_DD_CONTAINER_ID = "Datadog-Container-ID" ;
@@ -76,18 +87,28 @@ public String getFileName() {
7687 private final DebuggerMetrics debuggerMetrics ;
7788 private final boolean instrumentTheWorld ;
7889 private final RatelimitedLogger ratelimitedLogger ;
90+ private final RetryPolicy retryPolicy ;
7991
8092 private final Phaser inflightRequests = new Phaser (1 );
8193
/**
 * Creates an uploader targeting {@code endpoint}.
 *
 * <p>Delegates to the package-private constructor with a {@link RatelimitedLogger} built on this
 * class's logger and configured with {@code MINUTES_BETWEEN_ERROR_LOG} minutes.
 *
 * @param config agent configuration
 * @param endpoint target URL for uploads
 * @param retryPolicy retry bookkeeping handed to the delegated constructor
 */
public BatchUploader(Config config, String endpoint, RetryPolicy retryPolicy) {
  this(
      config,
      endpoint,
      new RatelimitedLogger(LOGGER, MINUTES_BETWEEN_ERROR_LOG, TimeUnit.MINUTES),
      retryPolicy);
}
85101
/**
 * Creates an uploader with an explicit rate-limited logger.
 *
 * <p>The container id and entity id are taken from {@link ContainerInfo} and passed, together
 * with the other arguments, to the full constructor.
 */
BatchUploader(
    Config config,
    String endpoint,
    RatelimitedLogger ratelimitedLogger,
    RetryPolicy retryPolicy) {
  this(
      config,
      endpoint,
      ratelimitedLogger,
      retryPolicy,
      ContainerInfo.get().containerId,
      ContainerInfo.getEntityId());
}
@@ -97,17 +118,17 @@ public BatchUploader(Config config, String endpoint) {
97118 Config config ,
98119 String endpoint ,
99120 RatelimitedLogger ratelimitedLogger ,
121+ RetryPolicy retryPolicy ,
100122 String containerId ,
101123 String entityId ) {
102124 instrumentTheWorld = config .isDebuggerInstrumentTheWorld ();
103125 if (endpoint == null || endpoint .length () == 0 ) {
104126 throw new IllegalArgumentException ("Endpoint url is empty" );
105127 }
106128 urlBase = HttpUrl .get (endpoint );
107- log .debug ("Started BatchUploader with target url {}" , urlBase );
129+ LOGGER .debug ("Started BatchUploader with target url {}" , urlBase );
108130 apiKey = config .getApiKey ();
109131 this .ratelimitedLogger = ratelimitedLogger ;
110- responseCallback = new ResponseCallback (ratelimitedLogger , inflightRequests );
111132 // This is the same thing OkHttp Dispatcher is doing except thread naming and daemonization
112133 okHttpExecutorService =
113134 new ThreadPoolExecutor (
@@ -117,6 +138,7 @@ public BatchUploader(Config config, String endpoint) {
117138 TimeUnit .SECONDS ,
118139 new SynchronousQueue <>(),
119140 new AgentThreadFactory (DEBUGGER_HTTP_DISPATCHER ));
141+ this .retryPolicy = retryPolicy ;
120142 this .containerId = containerId ;
121143 this .entityId = entityId ;
122144 Duration requestTimeout = Duration .ofSeconds (config .getDebuggerUploadTimeout ());
@@ -132,6 +154,8 @@ public BatchUploader(Config config, String endpoint) {
132154 null , /* proxyUsername */
133155 null , /* proxyPassword */
134156 requestTimeout .toMillis ());
157+ responseCallback =
158+ new ResponseCallback (ratelimitedLogger , inflightRequests , client , retryPolicy );
135159 debuggerMetrics = DebuggerMetrics .getInstance (config );
136160 }
137161
@@ -195,6 +219,10 @@ public HttpUrl getUrl() {
195219 return urlBase ;
196220 }
197221
222+ RetryPolicy getRetryPolicy () {
223+ return retryPolicy ;
224+ }
225+
198226 private void makeUploadRequest (byte [] json , String tags ) {
199227 int contentLength = json .length ;
200228 // use RequestBody.create(MediaType, byte[]) to avoid changing Content-Type to
@@ -205,8 +233,8 @@ private void makeUploadRequest(byte[] json, String tags) {
205233
206234 private void buildAndSendRequest (RequestBody body , int contentLength , String tags ) {
207235 debuggerMetrics .histogram ("batch.uploader.request.size" , contentLength );
208- if (log .isDebugEnabled ()) {
209- log .debug ("Uploading batch data size={} bytes" , contentLength );
236+ if (LOGGER .isDebugEnabled ()) {
237+ LOGGER .debug ("Uploading batch data size={} bytes" , contentLength );
210238 }
211239 HttpUrl .Builder builder = urlBase .newBuilder ();
212240 if (tags != null && !tags .isEmpty ()) {
@@ -215,17 +243,17 @@ private void buildAndSendRequest(RequestBody body, int contentLength, String tag
215243 Request .Builder requestBuilder = new Request .Builder ().url (builder .build ()).post (body );
216244 if (apiKey != null ) {
217245 if (apiKey .isEmpty ()) {
218- log .debug ("API key is empty" );
246+ LOGGER .debug ("API key is empty" );
219247 }
220248 if (apiKey .length () != 32 ) {
221- log .debug (
249+ LOGGER .debug (
222250 "API key length is incorrect (truncated?) expected=32 actual={} API key={}..." ,
223251 apiKey .length (),
224252 apiKey .substring (0 , Math .min (apiKey .length (), 6 )));
225253 }
226254 requestBuilder .addHeader (HEADER_DD_API_KEY , apiKey );
227255 } else {
228- log .debug ("API key is null" );
256+ LOGGER .debug ("API key is null" );
229257 }
230258 if (containerId != null ) {
231259 requestBuilder .addHeader (HEADER_DD_CONTAINER_ID , containerId );
@@ -234,24 +262,23 @@ private void buildAndSendRequest(RequestBody body, int contentLength, String tag
234262 requestBuilder .addHeader (HEADER_DD_ENTITY_ID , entityId );
235263 }
236264 Request request = requestBuilder .build ();
237- log .debug ("Sending request: {} CT: {}" , request , request .body ().contentType ());
238- client .newCall (request ).enqueue (responseCallback );
239- inflightRequests .register ();
265+ LOGGER .debug ("Sending request: {} CT: {}" , request , request .body ().contentType ());
266+ enqueueCall (client , request , responseCallback , retryPolicy , 0 , inflightRequests );
240267 }
241268
242269 public void shutdown () {
243270 try {
244271 inflightRequests .awaitAdvanceInterruptibly (inflightRequests .arrive (), 10 , TimeUnit .SECONDS );
245272 } catch (TimeoutException | InterruptedException ignored ) {
246- log .warn ("Not all upload requests have been handled" );
273+ LOGGER .warn ("Not all upload requests have been handled" );
247274 }
248275 okHttpExecutorService .shutdownNow ();
249276 try {
250277 okHttpExecutorService .awaitTermination (TERMINATION_TIMEOUT , TimeUnit .SECONDS );
251278 } catch (final InterruptedException e ) {
252279 // Note: this should only happen in main thread right before exiting, so eating up interrupted
253280 // state should be fine.
254- log .warn ("Wait for executor shutdown interrupted" );
281+ LOGGER .warn ("Wait for executor shutdown interrupted" );
255282 }
256283 client .connectionPool ().evictAll ();
257284 }
@@ -260,28 +287,68 @@ private boolean canEnqueueMoreRequests() {
260287 return client .dispatcher ().queuedCallsCount () < MAX_ENQUEUED_REQUESTS ;
261288 }
262289
290+ private static void enqueueCall (
291+ OkHttpClient client ,
292+ Request request ,
293+ Callback responseCallback ,
294+ RetryPolicy retryPolicy ,
295+ int failureCount ,
296+ Phaser inflightRequests ) {
297+ Call call = client .newCall (request );
298+ retryPolicy .failures .put (call , failureCount );
299+ call .enqueue (responseCallback );
300+ inflightRequests .register ();
301+ }
302+
263303 private static final class ResponseCallback implements Callback {
264304
265305 private final RatelimitedLogger ratelimitedLogger ;
266306 private final Phaser inflightRequests ;
307+ private final OkHttpClient client ;
308+ private final RetryPolicy retryPolicy ;
267309
/**
 * Builds a callback that logs through {@code ratelimitedLogger}, signals completion on {@code
 * inflightRequests}, and keeps {@code client} and {@code retryPolicy} for re-enqueueing failed
 * uploads.
 */
public ResponseCallback(
    final RatelimitedLogger ratelimitedLogger,
    Phaser inflightRequests,
    OkHttpClient client,
    RetryPolicy retryPolicy) {
  this.client = client;
  this.retryPolicy = retryPolicy;
  this.inflightRequests = inflightRequests;
  this.ratelimitedLogger = ratelimitedLogger;
}
272320
273321 @ Override
274- public void onFailure (final Call call , final IOException e ) {
322+ public void onFailure (Call call , IOException e ) {
275323 inflightRequests .arriveAndDeregister ();
276324 ratelimitedLogger .warn ("Failed to upload batch to {}" , call .request ().url (), e );
325+ handleRetry (call , retryPolicy .maxFailures );
326+ }
327+
328+ private void handleRetry (Call call , int maxFailures ) {
329+ Integer failure = retryPolicy .failures .remove (call );
330+ if (failure != null ) {
331+ int failureCount = failure + 1 ;
332+ if (failureCount <= maxFailures ) {
333+ LOGGER .debug (
334+ "Retrying upload to {}, {}/{}" , call .request ().url (), failureCount , maxFailures );
335+ enqueueCall (client , call .request (), this , retryPolicy , failureCount , inflightRequests );
336+ } else {
337+ LOGGER .warn (
338+ "Failed permanently to upload batch to {} after {} attempts" ,
339+ call .request ().url (),
340+ maxFailures );
341+ }
342+ }
277343 }
278344
279345 @ Override
280- public void onResponse (final Call call , final Response response ) {
346+ public void onResponse (Call call , Response response ) {
281347 try {
282348 inflightRequests .arriveAndDeregister ();
283349 if (response .isSuccessful ()) {
284- log .debug ("Upload done" );
350+ LOGGER .debug ("Upload done" );
351+ retryPolicy .failures .remove (call );
285352 } else {
286353 ResponseBody body = response .body ();
287354 // Retrieve body content for detailed error messages
@@ -301,6 +368,11 @@ public void onResponse(final Call call, final Response response) {
301368 response .message (),
302369 response .code ());
303370 }
371+ if (response .code () >= 500 || response .code () == 408 || response .code () == 429 ) {
372+ handleRetry (call , retryPolicy .maxFailures );
373+ } else {
374+ retryPolicy .failures .remove (call );
375+ }
304376 }
305377 } finally {
306378 response .close ();
0 commit comments