Skip to content

Commit 9f98c44

Browse files
authored
Add 5 min initial delay for undesired allocation warning (#112427)
When a cluster just starts up, all shards may reside on a single node. When a new node joins, it is likely half of the shards needs to relocate to the new node. This temporary undesired allocation is expected and should quickly resolve itself. This PR adds a 5 min initial delay so that the cluster does not log warning in such situation. Resolves: ES-9174
1 parent 265c704 commit 9f98c44

File tree

4 files changed

+27
-14
lines changed

4 files changed

+27
-14
lines changed

server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,10 @@ public class DesiredBalanceReconciler {
8989
private final DoubleGauge undesiredAllocationsRatio;
9090

9191
public DesiredBalanceReconciler(ClusterSettings clusterSettings, ThreadPool threadPool, MeterRegistry meterRegistry) {
92-
this.undesiredAllocationLogInterval = new FrequencyCappedAction(threadPool);
92+
this.undesiredAllocationLogInterval = new FrequencyCappedAction(
93+
threadPool.relativeTimeInMillisSupplier(),
94+
TimeValue.timeValueMinutes(5)
95+
);
9396
clusterSettings.initializeAndWatch(UNDESIRED_ALLOCATIONS_LOG_INTERVAL_SETTING, this.undesiredAllocationLogInterval::setMinInterval);
9497
clusterSettings.initializeAndWatch(
9598
UNDESIRED_ALLOCATIONS_LOG_THRESHOLD_SETTING,

server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/FrequencyCappedAction.java

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
package org.elasticsearch.cluster.routing.allocation.allocator;
1010

1111
import org.elasticsearch.core.TimeValue;
12-
import org.elasticsearch.threadpool.ThreadPool;
1312

1413
import java.util.function.LongSupplier;
1514

@@ -21,15 +20,12 @@ public class FrequencyCappedAction {
2120
private final LongSupplier currentTimeMillisSupplier;
2221
private TimeValue minInterval;
2322

24-
private long next = -1;
23+
private long next;
2524

26-
public FrequencyCappedAction(ThreadPool threadPool) {
27-
this(threadPool.relativeTimeInMillisSupplier());
28-
}
29-
30-
public FrequencyCappedAction(LongSupplier currentTimeMillisSupplier) {
25+
public FrequencyCappedAction(LongSupplier currentTimeMillisSupplier, TimeValue initialDelay) {
3126
this.currentTimeMillisSupplier = currentTimeMillisSupplier;
3227
this.minInterval = TimeValue.MAX_VALUE;
28+
this.next = currentTimeMillisSupplier.getAsLong() + initialDelay.getMillis();
3329
}
3430

3531
public void setMinInterval(TimeValue minInterval) {

server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
import org.elasticsearch.common.settings.Settings;
5656
import org.elasticsearch.common.util.Maps;
5757
import org.elasticsearch.common.util.concurrent.DeterministicTaskQueue;
58+
import org.elasticsearch.core.TimeValue;
5859
import org.elasticsearch.gateway.GatewayAllocator;
5960
import org.elasticsearch.index.IndexVersion;
6061
import org.elasticsearch.index.shard.ShardId;
@@ -1281,9 +1282,12 @@ public void testShouldLogOnTooManyUndesiredAllocations() {
12811282
.build();
12821283

12831284
var threadPool = mock(ThreadPool.class);
1284-
when(threadPool.relativeTimeInMillisSupplier()).thenReturn(new AtomicLong()::incrementAndGet);
1285+
final var timeInMillisSupplier = new AtomicLong();
1286+
when(threadPool.relativeTimeInMillisSupplier()).thenReturn(timeInMillisSupplier::incrementAndGet);
12851287

12861288
var reconciler = new DesiredBalanceReconciler(createBuiltInClusterSettings(), threadPool, mock(MeterRegistry.class));
1289+
final long initialDelayInMillis = TimeValue.timeValueMinutes(5).getMillis();
1290+
timeInMillisSupplier.addAndGet(randomLongBetween(initialDelayInMillis, 2 * initialDelayInMillis));
12871291

12881292
var expectedWarningMessage = "[100%] of assigned shards ("
12891293
+ shardCount
@@ -1323,7 +1327,9 @@ public void testShouldLogOnTooManyUndesiredAllocations() {
13231327
}
13241328

13251329
private static void reconcile(RoutingAllocation routingAllocation, DesiredBalance desiredBalance) {
1326-
new DesiredBalanceReconciler(createBuiltInClusterSettings(), mock(ThreadPool.class), mock(MeterRegistry.class)).reconcile(
1330+
final var threadPool = mock(ThreadPool.class);
1331+
when(threadPool.relativeTimeInMillisSupplier()).thenReturn(new AtomicLong()::incrementAndGet);
1332+
new DesiredBalanceReconciler(createBuiltInClusterSettings(), threadPool, mock(MeterRegistry.class)).reconcile(
13271333
desiredBalance,
13281334
routingAllocation
13291335
);

server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/FrequencyCappedActionTests.java

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
package org.elasticsearch.cluster.routing.allocation.allocator;
1010

11+
import org.elasticsearch.core.TimeValue;
1112
import org.elasticsearch.test.ESTestCase;
1213

1314
import java.util.concurrent.atomic.AtomicLong;
@@ -21,22 +22,29 @@ public void testFrequencyCapExecution() {
2122

2223
var executions = new AtomicLong(0);
2324
var currentTime = new AtomicLong();
24-
var action = new FrequencyCappedAction(currentTime::get);
25+
final TimeValue initialDelay = randomBoolean() ? TimeValue.ZERO : TimeValue.timeValueSeconds(between(1, 300));
26+
var action = new FrequencyCappedAction(currentTime::get, initialDelay);
2527

2628
var minInterval = timeValueMillis(randomNonNegativeInt());
2729
action.setMinInterval(minInterval);
2830

29-
// initial execution should happen
3031
action.maybeExecute(executions::incrementAndGet);
32+
if (initialDelay != TimeValue.ZERO) {
33+
// Not executing due to initial delay
34+
assertThat(executions.get(), equalTo(0L));
35+
currentTime.addAndGet(randomLongBetween(initialDelay.millis(), initialDelay.millis() * 2));
36+
action.maybeExecute(executions::incrementAndGet);
37+
}
38+
// initial execution should happen
3139
assertThat(executions.get(), equalTo(1L));
3240

3341
// should not execute again too soon
34-
currentTime.set(randomLongBetween(0, minInterval.millis() - 1));
42+
currentTime.addAndGet(randomLongBetween(0, minInterval.millis() - 1));
3543
action.maybeExecute(executions::incrementAndGet);
3644
assertThat(executions.get(), equalTo(1L));
3745

3846
// should execute min interval elapsed
39-
currentTime.set(randomLongBetween(minInterval.millis(), Long.MAX_VALUE));
47+
currentTime.addAndGet(randomLongBetween(minInterval.millis(), Long.MAX_VALUE));
4048
action.maybeExecute(executions::incrementAndGet);
4149
assertThat(executions.get(), equalTo(2L));
4250
}

0 commit comments

Comments
 (0)