Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
9524c1f
feat: add automatic S3 request retry with exponential backoff
allanrogerr May 4, 2026
a79aa11
fix: address Copilot review on retry mechanism
allanrogerr May 5, 2026
beead6e
fix: strengthen exception assertions in RetryTest and guard regionCac…
allanrogerr May 5, 2026
53da859
fix: replace instanceof assertions with typed catch to satisfy SpotBu…
allanrogerr May 5, 2026
60aeaed
fix: use ThreadLocalRandom for backoff jitter, disable OkHttp retry, …
allanrogerr May 5, 2026
d1c52e0
style: fix Spotless formatting violations in BaseS3Client
allanrogerr May 5, 2026
0d687ec
fix: propagate runAsync dispatch failure into retryFuture
allanrogerr May 5, 2026
34abd64
refactor: move retry to OkHttp interceptor, drop S3-code retry
allanrogerr May 7, 2026
1997b48
fix: narrow throws clauses in RetryTest to satisfy SpotBugs
allanrogerr May 7, 2026
b2cad44
Merge branch 'master' into feature/retry-mechanism
allanrogerr May 7, 2026
ba65312
refactor: strip retry to balamurugana's interceptor proposal scope
allanrogerr May 7, 2026
6d39a12
feat: restore full retry capability on top of OkHttp interceptor
allanrogerr May 7, 2026
756e294
docs(retry): clarify RetryInterceptor as supported API; add terminal-…
allanrogerr May 7, 2026
2f2fb4f
fix(retry): cancellation awareness, broader docs, tighter scope, more…
allanrogerr May 7, 2026
7fe6e87
fix(retry): drop public Javadoc links to package-private Retry; strip…
allanrogerr May 7, 2026
9a5c8e4
fix(retry): address bala review on PR #1701 review 4248622939
allanrogerr May 8, 2026
93e4f47
fix(retry): address bala review batch on PR #1701 review 4250022733
allanrogerr May 8, 2026
0ae488b
fix(retry): drop Retry.java; revert createBody bracket per r3206778779
allanrogerr May 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 40 additions & 12 deletions api/src/main/java/io/minio/BaseS3Client.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
* MinIO Java SDK for Amazon S3 Compatible Cloud Storage,
* (C) 2025 MinIO, Inc.
* (C) 2025-2026 MinIO, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -104,28 +104,54 @@ public abstract class BaseS3Client implements AutoCloseable {
private static final String UPLOAD_ID = "uploadId";
private static final Set<String> TRACE_QUERY_PARAMS =
ImmutableSet.of("retention", "legal-hold", "tagging", UPLOAD_ID, "acl", "attributes");

private PrintWriter traceStream;
protected final Map<String, String> regionCache = new ConcurrentHashMap<>();
protected String userAgent = Utils.getDefaultUserAgent();

protected Http.BaseUrl baseUrl;
protected Provider provider;
protected OkHttpClient httpClient;
protected volatile OkHttpClient httpClient;
protected boolean closeHttpClient;

/**
* Maximum attempts per S3 request. Effective only on the SDK-default {@link OkHttpClient};
* caller-supplied clients are used verbatim and the SDK does not modify their retry policy.
*/
volatile int maxRetries = Http.RetryInterceptor.MAX_RETRY;

protected BaseS3Client(
Http.BaseUrl baseUrl, Provider provider, OkHttpClient httpClient, boolean closeHttpClient) {
this.baseUrl = baseUrl;
this.provider = provider;
this.httpClient = httpClient;
this.closeHttpClient = closeHttpClient;
this.httpClient = httpClient;
}

/**
* Copies share the source's {@link OkHttpClient} (and its retry interceptor binding), so {@link
* #setMaxRetries(int)} on a copy does not affect the shared retry budget.
*/
protected BaseS3Client(BaseS3Client client) {
this.baseUrl = client.baseUrl;
this.provider = client.provider;
this.httpClient = client.httpClient;
this.closeHttpClient = client.closeHttpClient;
this.maxRetries = client.maxRetries;
this.httpClient = client.httpClient;
}

/**
* Sets the maximum number of attempts for transient HTTP failures. Pass {@code 1} to disable
* automatic retries. Default {@code 10}.
*
* <p>Effective only on the SDK-default {@link OkHttpClient}; caller-supplied clients are used
* verbatim.
*
* @param maxRetries maximum attempts (must be {@code >= 1}).
*/
public void setMaxRetries(int maxRetries) {
if (maxRetries < 1) throw new IllegalArgumentException("maxRetries must be >= 1");
this.maxRetries = maxRetries;
}

/** Closes underneath HTTP client. */
Expand Down Expand Up @@ -182,6 +208,11 @@ public void setAppInfo(String name, String version) {
/**
* Enables HTTP call tracing and written to traceStream.
*
* <p><b>Retry caveat.</b> Tracing happens at the SDK callback level, so only the final response
* of a retry sequence is recorded. Per-attempt traces (which the {@link Http.RetryInterceptor}
* sees and discards) are not surfaced here. To inspect individual retry attempts, register an
* OkHttp {@code HttpLoggingInterceptor} on a custom client.
*
* @param traceStream {@link OutputStream} for writing HTTP call tracing.
* @see #traceOff
*/
Expand Down Expand Up @@ -268,7 +299,10 @@ private String[] handleRedirectResponse(
return new String[] {code, message};
}

/** Execute HTTP request asynchronously for given parameters. */
/**
* Execute HTTP request asynchronously. Retry handling is delegated to {@link
* Http.RetryInterceptor} on the {@link OkHttpClient}, so this method itself never loops.
*/
protected CompletableFuture<Response> executeAsync(Http.S3Request s3request, String region) {
Credentials credentials = (provider == null) ? null : provider.fetch();
Http.Request request = null;
Expand All @@ -282,15 +316,9 @@ protected CompletableFuture<Response> executeAsync(Http.S3Request s3request, Str
PrintWriter traceStream = this.traceStream;
if (traceStream != null) traceStream.print(request.httpTraces());

OkHttpClient httpClient = this.httpClient;
Comment thread
allanrogerr marked this conversation as resolved.
// FIXME: enable retry for all request.
// if (!s3request.retryFailure()) {
// httpClient = httpClient.newBuilder().retryOnConnectionFailure(false).build();
// }

okhttp3.Request httpRequest = request.httpRequest();
CompletableFuture<Response> completableFuture = newCompleteableFuture();
httpClient
this.httpClient
.newCall(httpRequest)
.enqueue(
new Callback() {
Expand Down
222 changes: 218 additions & 4 deletions api/src/main/java/io/minio/Http.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* MinIO Java SDK for Amazon S3 Compatible Cloud Storage, (C) 2025 MinIO, Inc.
* MinIO Java SDK for Amazon S3 Compatible Cloud Storage, (C) 2025-2026 MinIO, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -35,6 +35,8 @@
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.security.SecureRandom;
import java.security.cert.CertPathBuilderException;
import java.security.cert.CertPathValidatorException;
import java.security.cert.CertificateException;
import java.security.cert.CertificateFactory;
import java.security.cert.X509Certificate;
Expand All @@ -52,19 +54,25 @@
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
import java.util.function.IntSupplier;
import java.util.regex.Matcher;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import javax.annotation.Nonnull;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.KeyManagerFactory;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLException;
import javax.net.ssl.SSLHandshakeException;
import javax.net.ssl.SSLPeerUnverifiedException;
import javax.net.ssl.SSLSession;
import javax.net.ssl.SSLSocketFactory;
import javax.net.ssl.TrustManager;
import javax.net.ssl.TrustManagerFactory;
import javax.net.ssl.X509TrustManager;
import okhttp3.Interceptor;
import okhttp3.MediaType;
import okhttp3.OkHttpClient;
import okhttp3.Protocol;
Expand Down Expand Up @@ -609,17 +617,28 @@ public static OkHttpClient enableExternalCertificatesFromEnv(OkHttpClient client
}

/**
* Creates new HTTP client with default timeout with additional TLS certificates from
* SSL_CERT_FILE and SSL_CERT_DIR environment variables if present.
* Creates the SDK's default HTTP client with a fixed retry budget of {@link
* RetryInterceptor#MAX_RETRY}. For a runtime-tunable budget, use {@link
* #newDefaultClient(IntSupplier)}.
*/
public static OkHttpClient newDefaultClient() {
return newDefaultClient(() -> RetryInterceptor.MAX_RETRY);
}

/**
* Creates the SDK's default HTTP client; the {@link RetryInterceptor}'s attempt budget is read
* from {@code maxAttemptsSupplier} on each call so it can track a runtime-tunable value.
*/
public static OkHttpClient newDefaultClient(IntSupplier maxAttemptsSupplier) {
OkHttpClient client =
new OkHttpClient()
.newBuilder()
.connectTimeout(DEFAULT_TIMEOUT, TimeUnit.MILLISECONDS)
.writeTimeout(DEFAULT_TIMEOUT, TimeUnit.MILLISECONDS)
.readTimeout(DEFAULT_TIMEOUT, TimeUnit.MILLISECONDS)
.protocols(Arrays.asList(Protocol.HTTP_1_1))
.retryOnConnectionFailure(false)
.addInterceptor(new RetryInterceptor(maxAttemptsSupplier))
.build();
try {
return enableExternalCertificatesFromEnv(client);
Expand Down Expand Up @@ -683,6 +702,194 @@ public static OkHttpClient setTimeout(
.build();
}

/**
* OkHttp interceptor that retries transient HTTP failures with full-jitter exponential backoff.
*
* <p>Installed automatically by {@link Http#newDefaultClient(IntSupplier)} on the SDK-default
* client. <em>Not</em> installed on caller-supplied clients passed via {@code
* MinioClient.Builder.httpClient(...)} — those are used verbatim. To opt into the SDK retry
* policy on a custom client, register this interceptor and pair it with {@code
* retryOnConnectionFailure(false)}.
*
* <p>Retries on:
*
* <ul>
* <li>retryable IOException — connection reset, EOF, socket timeout, idle-connection close.
* Excludes TLS handshake / unknown-CA / HTTPS protocol mismatch.
* <li>retryable HTTP status code — 408, 429, 499, 500, 502, 503, 504, 520.
* </ul>
*
* <p>Backoff is full-jitter exponential, with a 200&nbsp;ms unit and a 1&nbsp;s per-attempt cap.
* Attempt budget is read from an {@link IntSupplier} on every call so SDK clients can expose
* runtime tuning while the interceptor itself stays stateless.
*
* <p><b>Threading.</b> Backoff sleeps on the OkHttp dispatcher thread that owns the call; under
* sustained 5xx/429 storms this can hold dispatcher slots idle. Size {@link
* okhttp3.Dispatcher#setMaxRequests} / {@code setMaxRequestsPerHost} accordingly.
*
* <p><b>Cancellation.</b> {@code Call.cancel()} short-circuits the retry loop instead of being
* mistaken for a retryable transport error.
*
* <p><b>Replayability.</b> SDK-owned body types ({@link Body} over {@code byte[]}, {@link
* ByteBuffer}, {@link RandomAccessFile}) are retry-safe. A caller-supplied {@link
* okhttp3.RequestBody} that overrides {@code isOneShot()} to {@code true} MUST NOT be retried;
* either disable retries for those calls via {@link BaseS3Client#setMaxRetries(int)} {@code (1)}
* or wrap the body in a replayable form.
*
* <p><b>Request reuse.</b> Each attempt sends the same signed {@link okhttp3.Request}, so {@code
* X-Amz-Date}/{@code Authorization} are not refreshed. Harmless at the default budget; an extreme
* {@code maxRetries} combined with high backoff could outlast the 15-minute signing window or a
* short-lived STS credential. (minio-go re-signs per attempt; this Java implementation
* deliberately does not, in exchange for a simpler interceptor model.)
*/
public static class RetryInterceptor implements Interceptor {
Comment thread
allanrogerr marked this conversation as resolved.
/** Default maximum number of attempts per request. */
static final int MAX_RETRY = 10;

/** Base unit per retry attempt, in milliseconds. */
static final long DEFAULT_RETRY_UNIT_MS = 200L;

/** Per-attempt sleep cap, in milliseconds. */
static final long DEFAULT_RETRY_CAP_MS = 1_000L;

/** Maximum jitter fraction in {@code [0.0, 1.0]}. {@code 1.0} = full jitter. */
static final double MAX_JITTER = 1.0;

/** Retryable HTTP status codes. */
static final Set<Integer> RETRYABLE_HTTP_STATUS_CODES =
ImmutableSet.of(
408, // Request Timeout
429, // Too Many Requests
499, // Client Closed Request (nginx)
500, // Internal Server Error
502, // Bad Gateway
503, // Service Unavailable
504, // Gateway Timeout
520); // Cloudflare unknown error

static boolean isHttpStatusRetryable(int code) {
return RETRYABLE_HTTP_STATUS_CODES.contains(code);
}

/**
* Returns true if {@code e} represents a transient transport failure that should be retried.
* TLS handshake failure, unknown-CA / cert-path errors, and the "server gave HTTP response to
* HTTPS client" protocol mismatch are NOT retryable; everything else (connection reset, EOF,
* server closed idle connection, socket timeout, …) is.
*/
static boolean isRequestErrorRetryable(IOException e) {
if (e instanceof SSLHandshakeException) return false;
if (e instanceof SSLPeerUnverifiedException) return false;
if (e instanceof SSLException) {
Throwable cause = e.getCause();
if (cause instanceof CertPathBuilderException
|| cause instanceof CertPathValidatorException
|| cause instanceof CertificateException) {
return false;
}
}
String msg = e.getMessage();
if (msg != null && msg.contains("server gave HTTP response to HTTPS client")) return false;
return true;
}

/**
* Computes the exponential-backoff-with-full-jitter delay for retry {@code attempt} (0-indexed:
* {@code 0} = before the second attempt, {@code 1} = before the third, …):
*
* <pre>
* sleep = min(DEFAULT_RETRY_CAP_MS, DEFAULT_RETRY_UNIT_MS * 2^attempt)
* sleep -= (long)(random.nextDouble() * sleep * MAX_JITTER) // full jitter when MAX_JITTER == 1.0
* </pre>
*
* <p>With {@code MAX_JITTER == 1.0}, returns a value in {@code [1, min(cap, base *
* 2^attempt)]}. The lower bound is {@code 1} rather than {@code 0} because {@link
* java.util.concurrent.ThreadLocalRandom#nextDouble()} is in {@code [0.0, 1.0)} and the {@code
* (long)} cast truncates {@code rand * sleep} to at most {@code sleep - 1}. This matches the
* behaviour of minio-go's {@code exponentialBackoffWait}, which uses the same formula and
* therefore the same bounds.
*/
static long exponentialBackoffMs(int attempt) {
int exp = Math.min(Math.max(attempt, 0), 30);
long sleep = DEFAULT_RETRY_UNIT_MS * (1L << exp);
if (sleep > DEFAULT_RETRY_CAP_MS) sleep = DEFAULT_RETRY_CAP_MS;
sleep -= (long) (ThreadLocalRandom.current().nextDouble() * (double) sleep * MAX_JITTER);
return Math.max(0L, sleep);
}

private final IntSupplier maxAttemptsSupplier;

/** Uses a fixed budget of {@link #MAX_RETRY} attempts. */
public RetryInterceptor() {
this(() -> MAX_RETRY);
}

/**
* Reads the attempt budget from {@code maxAttemptsSupplier} on every call so it can track a
* runtime-tunable value. Values below 1 are clamped to 1 (= retry off).
*/
public RetryInterceptor(IntSupplier maxAttemptsSupplier) {
this.maxAttemptsSupplier = Objects.requireNonNull(maxAttemptsSupplier, "maxAttemptsSupplier");
}

@Override
public okhttp3.Response intercept(Chain chain) throws IOException {
okhttp3.Request request = chain.request();
int maxAttempts = Math.max(1, maxAttemptsSupplier.getAsInt());

okhttp3.Response response = null;
IOException lastException = null;

for (int attempt = 0; attempt < maxAttempts; attempt++) {
// Honour caller cancellation: mirrors minio-go's
// `errors.Is(err, context.Canceled)` short-circuit so the loop does not
// burn attempts re-issuing a call the caller has already abandoned.
if (chain.call().isCanceled()) {
if (response != null) response.close();
throw new IOException("Canceled");
}

if (attempt > 0) {
long delayMs = exponentialBackoffMs(attempt - 1);
if (delayMs > 0L) {
try {
Thread.sleep(delayMs);
} catch (InterruptedException ie) {
Thread.currentThread().interrupt();
throw new IOException("Retry interrupted", ie);
}
}
}

if (response != null) {
response.close();
response = null;
}

try {
response = chain.proceed(request);
} catch (IOException e) {
// A cancelled call surfaces as IOException with a message that would
// otherwise pass `isRequestErrorRetryable`. Bail out instead.
if (chain.call().isCanceled()) throw e;
if (!isRequestErrorRetryable(e)) throw e;
lastException = e;
continue;
}

if (isHttpStatusRetryable(response.code())) {
lastException = null;
continue;
}

return response;
}

if (lastException != null) throw lastException;
return response;
}
}

/** HTTP body of {@link RandomAccessFile}, {@link ByteBuffer} or {@link byte} array. */
public static class Body {
private okhttp3.RequestBody requestBody;
Expand All @@ -695,7 +902,14 @@ public static class Body {
private String md5Hash;
private boolean bodyString;

/** Creates Body for okhttp3 RequestBody. */
/**
* Creates Body for okhttp3 RequestBody.
*
* <p><b>Retry caveat.</b> {@link Http.RetryInterceptor} re-invokes {@code writeTo} on each
* retry. Pass only replayable bodies; do not pass a body that overrides {@code isOneShot()} to
* return {@code true} (or otherwise consumes its source on first write) unless retries are
* disabled for that call.
*/
public Body(okhttp3.RequestBody requestBody) {
this.requestBody = requestBody;
this.contentType = requestBody.contentType();
Expand Down
Loading
Loading