
Commit 34a8107

atakavci and Copilot authored
[automatic failover] Implement failback with given grace period (#4204)
* - weighted cluster selection
  - HealthStatus manager with initial listener and registration logic
  - pluggable health-checker strategy introduced; draft NoOpStrategy, EchoStrategy, LagAwareStrategy
  - fix failing tests impacted by weighted clusters
* - add builder for ClusterConfig
  - add echo to CommandObjects and UnifiedJedis
  - improve StrategySupplier by accepting JedisClientConfig
  - adapt EchoStrategy to StrategySupplier; it now handles connection creation by accepting an endpoint and a JedisClientConfig
  - make health checks disabled by default
  - drop NoOpStrategy
  - add unit & integration tests for health checks
* fix naming
* clean up and mark override methods
* fix link in javadoc
* fix formatting
* - fix double-registered listeners in HealthStatusManager
  - clear redundant catch
  - replace failover options and drop the FailoverOptions class
  - remove FORCED_UNHEALTHY from HealthStatus
  - fix failback check
  - add disabled flag to cluster
  - update/fix related tests
* Update src/main/java/redis/clients/jedis/mcf/EchoStrategy.java
  Co-authored-by: Copilot <[email protected]>
* add support for removing endpoints
* - replace cluster "disabled" with failbackCandidate
  - replace "failback enabled" with failbackSupported in the client
  - fix formatting
  - set defaults
* - remove failback candidate
  - fix failing tests
* - fix remove logic
  - fix failing tests
* - periodic failback checks
  - introduce grace period
  - fix issue when the CB is FORCED_OPEN and the grace period has completed
* format
* fix timing issue

---------

Co-authored-by: Copilot <[email protected]>
1 parent 8761136 commit 34a8107

File tree

7 files changed, +950 -25 lines changed

src/main/java/redis/clients/jedis/MultiClusterClientConfig.java

Lines changed: 31 additions & 0 deletions
@@ -63,6 +63,9 @@ public static interface StrategySupplier {
   private static final List<Class<? extends Throwable>> FALLBACK_EXCEPTIONS_DEFAULT = Arrays
       .asList(CallNotPermittedException.class, ConnectionFailoverException.class);
 
+  private static final long FAILBACK_CHECK_INTERVAL_DEFAULT = 5000; // 5 seconds
+  private static final long GRACE_PERIOD_DEFAULT = 10000; // 10 seconds
+
   private final ClusterConfig[] clusterConfigs;
 
   //////////// Retry Config - https://resilience4j.readme.io/docs/retry ////////////
@@ -152,6 +155,12 @@ public static interface StrategySupplier {
   /** Whether failback is supported by client */
   private boolean isFailbackSupported;
 
+  /** Interval in milliseconds to wait before attempting failback to a recovered cluster */
+  private long failbackCheckInterval;
+
+  /** Grace period in milliseconds to keep clusters disabled after they become unhealthy */
+  private long gracePeriod;
+
   public MultiClusterClientConfig(ClusterConfig[] clusterConfigs) {
     this.clusterConfigs = clusterConfigs;
   }
@@ -225,6 +234,14 @@ public boolean isFailbackSupported() {
     return isFailbackSupported;
   }
 
+  public long getFailbackCheckInterval() {
+    return failbackCheckInterval;
+  }
+
+  public long getGracePeriod() {
+    return gracePeriod;
+  }
+
   public static Builder builder(ClusterConfig[] clusterConfigs) {
     return new Builder(clusterConfigs);
   }
@@ -358,6 +375,8 @@ public static class Builder {
 
     private boolean retryOnFailover = false;
     private boolean isFailbackSupported = true;
+    private long failbackCheckInterval = FAILBACK_CHECK_INTERVAL_DEFAULT;
+    private long gracePeriod = GRACE_PERIOD_DEFAULT;
 
     public Builder(ClusterConfig[] clusterConfigs) {
@@ -461,6 +480,16 @@ public Builder failbackSupported(boolean supported) {
       return this;
     }
 
+    public Builder failbackCheckInterval(long failbackCheckInterval) {
+      this.failbackCheckInterval = failbackCheckInterval;
+      return this;
+    }
+
+    public Builder gracePeriod(long gracePeriod) {
+      this.gracePeriod = gracePeriod;
+      return this;
+    }
+
     public MultiClusterClientConfig build() {
       MultiClusterClientConfig config = new MultiClusterClientConfig(this.clusterConfigs);
 
@@ -488,6 +517,8 @@ public MultiClusterClientConfig build() {
 
       config.retryOnFailover = this.retryOnFailover;
      config.isFailbackSupported = this.isFailbackSupported;
+      config.failbackCheckInterval = this.failbackCheckInterval;
+      config.gracePeriod = this.gracePeriod;
 
       return config;
     }
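
The two new options plug into the existing MultiClusterClientConfig.Builder alongside failbackSupported. A minimal usage sketch (the clusterConfigs array is assumed to be built elsewhere via the ClusterConfig builder added in this PR; the interval and grace values are illustrative, not the 5 s / 10 s defaults):

    // Sketch only: clusterConfigs construction is elided; values below are examples.
    MultiClusterClientConfig config = MultiClusterClientConfig.builder(clusterConfigs)
        .failbackSupported(true)        // failback stays enabled (the default)
        .failbackCheckInterval(3000)    // look for a higher-weight healthy cluster every 3 seconds
        .gracePeriod(15000)             // keep a failed cluster out of rotation for 15 seconds
        .build();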

src/main/java/redis/clients/jedis/mcf/CircuitBreakerFailoverBase.java

Lines changed: 13 additions & 4 deletions
@@ -6,6 +6,7 @@
 import redis.clients.jedis.annots.Experimental;
 import redis.clients.jedis.exceptions.JedisConnectionException;
 import redis.clients.jedis.providers.MultiClusterPooledConnectionProvider;
+import redis.clients.jedis.providers.MultiClusterPooledConnectionProvider.Cluster;
 import redis.clients.jedis.util.IOUtils;
 
 /**
@@ -46,16 +47,24 @@ protected void clusterFailover(CircuitBreaker circuitBreaker) {
       // Transitions state machine to a FORCED_OPEN state, stopping state transition, metrics and
       // event publishing.
       // To recover/transition from this forced state the user will need to manually failback
+
+      Cluster activeCluster = provider.getCluster();
+      // This should never happen in theory !!
+      if (activeCluster.getCircuitBreaker() != circuitBreaker) throw new IllegalStateException(
+        "A circuitbreaker failover can be triggered only by the active cluster!");
+
+      activeCluster.setGracePeriod();
       circuitBreaker.transitionToForcedOpenState();
 
       // Iterating the active cluster will allow subsequent calls to the executeCommand() to use the next
       // cluster's connection pool - according to the configuration's prioritization/order/weight
       // int activeMultiClusterIndex = provider.incrementActiveMultiClusterIndex1();
-      provider.iterateActiveCluster();
+      if (provider.iterateActiveCluster() != null) {
 
-      // Implementation is optionally provided during configuration. Typically, used for
-      // activeMultiClusterIndex persistence or custom logging
-      provider.runClusterFailoverPostProcessor(provider.getCluster());
+        // Implementation is optionally provided during configuration. Typically, used for
+        // activeMultiClusterIndex persistence or custom logging
+        provider.runClusterFailoverPostProcessor(provider.getCluster());
+      }
     }
     // this check relies on the fact that many failover attempts can hit with the same CB,
     // only the first one will trigger a failover, and make the CB FORCED_OPEN.
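
With this change the post-processor only runs when iterateActiveCluster() actually finds another healthy cluster, and by the time it is invoked provider.getCluster() already points at the new active cluster. A hedged sketch of wiring one up, assuming the provider exposes a setClusterFailoverPostProcessor(Consumer<Cluster>) setter (that setter is not part of this diff):

    // Hypothetical wiring - the setter name/signature is an assumption, not shown in this commit.
    provider.setClusterFailoverPostProcessor(newActiveCluster -> log
        .info("Failed over; new active circuit breaker: {}", newActiveCluster.getCircuitBreaker().getName()));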

src/main/java/redis/clients/jedis/providers/MultiClusterPooledConnectionProvider.java

Lines changed: 97 additions & 20 deletions
@@ -16,6 +16,10 @@
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.locks.Lock;
 import java.util.concurrent.locks.ReentrantLock;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+
 import java.util.function.Consumer;
 import java.util.function.Predicate;
 
@@ -82,6 +86,12 @@ public class MultiClusterPooledConnectionProvider implements ConnectionProvider
 
   private HealthStatusManager healthStatusManager = new HealthStatusManager();
 
+  // Failback mechanism fields
+  private final ScheduledExecutorService failbackScheduler = Executors.newSingleThreadScheduledExecutor(r -> {
+    Thread t = new Thread(r, "failback-scheduler");
+    t.setDaemon(true);
+    return t;
+  });
   // Store retry and circuit breaker configs for dynamic cluster addition/removal
   private RetryConfig retryConfig;
   private CircuitBreakerConfig circuitBreakerConfig;
@@ -151,6 +161,13 @@ public MultiClusterPooledConnectionProvider(MultiClusterClientConfig multiCluste
     /// --- ///
 
     this.fallbackExceptionList = multiClusterClientConfig.getFallbackExceptionList();
+
+    // Start periodic failback checker
+    if (multiClusterClientConfig.isFailbackSupported()) {
+      long failbackInterval = multiClusterClientConfig.getFailbackCheckInterval();
+      failbackScheduler.scheduleAtFixedRate(this::periodicFailbackCheck, failbackInterval, failbackInterval,
+          TimeUnit.MILLISECONDS);
+    }
   }
 
   /**
@@ -194,6 +211,7 @@ public void remove(Endpoint endpoint) {
     if (multiClusterMap.size() < 2) {
       throw new JedisValidationException("Cannot remove the last remaining endpoint");
     }
+    log.debug("Removing endpoint {}", endpoint);
 
     activeClusterIndexLock.lock();
     try {
@@ -251,7 +269,6 @@ private void addClusterInternal(MultiClusterClientConfig multiClusterClientConfi
     circuitBreakerEventPublisher.onError(event -> log.error(String.valueOf(event)));
     circuitBreakerEventPublisher.onFailureRateExceeded(event -> log.error(String.valueOf(event)));
     circuitBreakerEventPublisher.onSlowCallRateExceeded(event -> log.error(String.valueOf(event)));
-    circuitBreakerEventPublisher.onStateTransition(event -> log.warn(String.valueOf(event)));
 
     ConnectionPool pool;
     if (poolConfig != null) {
@@ -281,20 +298,51 @@ private void handleStatusChange(HealthStatusChangeEvent eventArgs) {
 
     clusterWithHealthChange.setHealthStatus(newStatus);
 
-    if (newStatus.isHealthy()) {
-      if (clusterWithHealthChange.isFailbackSupported() && activeCluster != clusterWithHealthChange) {
-        // lets check if weighted switching is possible
-        Map.Entry<Endpoint, Cluster> failbackCluster = findWeightedHealthyClusterToIterate();
-        if (failbackCluster == clusterWithHealthChange
-            && clusterWithHealthChange.getWeight() > activeCluster.getWeight()) {
-          setActiveCluster(clusterWithHealthChange, false);
+    if (!newStatus.isHealthy()) {
+      // Handle failover if this was the active cluster
+      if (clusterWithHealthChange == activeCluster) {
+        clusterWithHealthChange.setGracePeriod();
+        if (iterateActiveCluster() != null) {
+          this.runClusterFailoverPostProcessor(activeCluster);
         }
       }
-    } else if (clusterWithHealthChange == activeCluster) {
-      if (iterateActiveCluster() != null) {
-        this.runClusterFailoverPostProcessor(activeCluster);
+    }
+  }
+
+  /**
+   * Periodic failback checker - runs at configured intervals to check for failback opportunities
+   */
+  private void periodicFailbackCheck() {
+    // Find the best candidate cluster for failback
+    Cluster bestCandidate = null;
+    float bestWeight = activeCluster.getWeight();
+
+    for (Map.Entry<Endpoint, Cluster> entry : multiClusterMap.entrySet()) {
+      Cluster cluster = entry.getValue();
+
+      // Skip if this is already the active cluster
+      if (cluster == activeCluster) {
+        continue;
+      }
+
+      // Skip if cluster is not healthy
+      if (!cluster.isHealthy()) {
+        continue;
+      }
+
+      // This cluster is a valid candidate
+      if (cluster.getWeight() > bestWeight) {
+        bestCandidate = cluster;
+        bestWeight = cluster.getWeight();
       }
     }
+
+    // Perform failback if we found a better candidate
+    if (bestCandidate != null) {
+      log.info("Performing failback from {} to {} (higher weight cluster available)",
+          activeCluster.getCircuitBreaker().getName(), bestCandidate.getCircuitBreaker().getName());
+      setActiveCluster(bestCandidate, true);
+    }
   }
 
   public Endpoint iterateActiveCluster() {
@@ -397,7 +445,21 @@ private boolean setActiveCluster(Cluster cluster, boolean validateConnection) {
 
   @Override
   public void close() {
-    activeCluster.getConnectionPool().close();
+    // Shutdown the failback scheduler
+    failbackScheduler.shutdown();
+    try {
+      if (!failbackScheduler.awaitTermination(1, TimeUnit.SECONDS)) {
+        failbackScheduler.shutdownNow();
+      }
+    } catch (InterruptedException e) {
+      failbackScheduler.shutdownNow();
+      Thread.currentThread().interrupt();
+    }
+
+    // Close all cluster connection pools
+    for (Cluster cluster : multiClusterMap.values()) {
+      cluster.getConnectionPool().close();
+    }
   }
 
   @Override
@@ -425,26 +487,21 @@ public Cluster getCluster() {
   }
 
   @VisibleForTesting
-  public Cluster getCluster(Endpoint multiClusterIndex) {
-    return multiClusterMap.get(multiClusterIndex);
+  public Cluster getCluster(Endpoint endpoint) {
+    return multiClusterMap.get(endpoint);
   }
 
   public CircuitBreaker getClusterCircuitBreaker() {
     return activeCluster.getCircuitBreaker();
   }
 
-  public CircuitBreaker getClusterCircuitBreaker(int multiClusterIndex) {
-    return activeCluster.getCircuitBreaker();
-  }
-
   /**
    * Indicates the final cluster/database endpoint (connection pool), according to the pre-configured list provided at
    * startup via the MultiClusterClientConfig, is unavailable and therefore no further failover is possible. Users can
    * manually failback to an available cluster
    */
   public boolean canIterateOnceMore() {
     Map.Entry<Endpoint, Cluster> e = findWeightedHealthyClusterToIterate();
-
     return e != null;
   }
 
@@ -472,6 +529,9 @@ public static class Cluster {
     private MultiClusterClientConfig multiClusterClientConfig;
     private boolean disabled = false;
 
+    // Grace period tracking
+    private volatile long gracePeriodEndsAt = 0;
+
     public Cluster(ConnectionPool connectionPool, Retry retry, CircuitBreaker circuitBreaker, float weight,
         MultiClusterClientConfig multiClusterClientConfig) {
       this.connectionPool = connectionPool;
@@ -513,11 +573,14 @@ public float getWeight() {
     }
 
     public boolean isCBForcedOpen() {
+      if (circuitBreaker.getState() == State.FORCED_OPEN && !isInGracePeriod()) {
+        circuitBreaker.transitionToClosedState();
+      }
       return circuitBreaker.getState() == CircuitBreaker.State.FORCED_OPEN;
     }
 
     public boolean isHealthy() {
-      return healthStatus.isHealthy() && !isCBForcedOpen() && !disabled;
+      return healthStatus.isHealthy() && !isCBForcedOpen() && !disabled && !isInGracePeriod();
     }
 
     public boolean retryOnFailover() {
@@ -532,6 +595,20 @@ public void setDisabled(boolean disabled) {
       this.disabled = disabled;
     }
 
+    /**
+     * Checks if the cluster is currently in grace period
+     */
+    public boolean isInGracePeriod() {
+      return System.currentTimeMillis() < gracePeriodEndsAt;
+    }
+
+    /**
+     * Sets the grace period for this cluster
+     */
+    public void setGracePeriod() {
+      gracePeriodEndsAt = System.currentTimeMillis() + multiClusterClientConfig.getGracePeriod();
+    }
+
     /**
      * Whether failback is supported by client
      */
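
The grace period is what keeps periodicFailbackCheck() from flapping straight back to a cluster that has just failed over: setGracePeriod() makes isHealthy() return false for the configured window regardless of the reported health status. A rough illustration of the intended sequence (illustrative only; cluster is a Cluster whose config was built with gracePeriod(10000) and whose health status is otherwise healthy):

    cluster.setGracePeriod();      // called when the active cluster fails over
    cluster.isInGracePeriod();     // true for roughly the next 10 seconds
    cluster.isHealthy();           // false - the grace period overrides an otherwise healthy status

    // ...once the 10-second window has elapsed...
    cluster.isInGracePeriod();     // false
    cluster.isHealthy();           // true again, provided the health status, circuit breaker state and
                                   // disabled flag all allow it, so the periodic failback check can pick it up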
