Skip to content

Commit 2a9a3a4

Browse files
authored
Add a not-master state for desired balance (#116904)
The new state prevents a long running desired balance computation to set result after the node stands down as master.
1 parent cf9687f commit 2a9a3a4

File tree

6 files changed

+112
-39
lines changed

6 files changed

+112
-39
lines changed

docs/changelog/116904.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 116904
2+
summary: Add a not-master state for desired balance
3+
area: Allocation
4+
type: enhancement
5+
issues: []

server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalance.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,14 @@ public DesiredBalance(long lastConvergedIndex, Map<ShardId, ShardAssignment> ass
4040
this(lastConvergedIndex, assignments, Map.of(), ComputationFinishReason.CONVERGED);
4141
}
4242

43-
public static final DesiredBalance INITIAL = new DesiredBalance(-1, Map.of());
43+
/**
44+
* The placeholder value for {@link DesiredBalance} when the node stands down as master.
45+
*/
46+
public static final DesiredBalance NOT_MASTER = new DesiredBalance(-2, Map.of());
47+
/**
48+
* The starting value for {@link DesiredBalance} when the node becomes the master.
49+
*/
50+
public static final DesiredBalance BECOME_MASTER_INITIAL = new DesiredBalance(-1, Map.of());
4451

4552
public ShardAssignment getAssignment(ShardId shardId) {
4653
return assignments.get(shardId);

server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceShardsAllocator.java

Lines changed: 53 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import org.elasticsearch.cluster.service.MasterService;
3030
import org.elasticsearch.cluster.service.MasterServiceTaskQueue;
3131
import org.elasticsearch.common.Priority;
32+
import org.elasticsearch.common.Strings;
3233
import org.elasticsearch.common.metrics.CounterMetric;
3334
import org.elasticsearch.common.metrics.MeanMetric;
3435
import org.elasticsearch.common.settings.ClusterSettings;
@@ -43,6 +44,7 @@
4344
import java.util.Set;
4445
import java.util.concurrent.ConcurrentLinkedQueue;
4546
import java.util.concurrent.atomic.AtomicLong;
47+
import java.util.concurrent.atomic.AtomicReference;
4648

4749
/**
4850
* A {@link ShardsAllocator} which asynchronously refreshes the desired balance held by the {@link DesiredBalanceComputer} and then takes
@@ -62,7 +64,7 @@ public class DesiredBalanceShardsAllocator implements ShardsAllocator {
6264
private final AtomicLong indexGenerator = new AtomicLong(-1);
6365
private final ConcurrentLinkedQueue<List<MoveAllocationCommand>> pendingDesiredBalanceMoves = new ConcurrentLinkedQueue<>();
6466
private final MasterServiceTaskQueue<ReconcileDesiredBalanceTask> masterServiceTaskQueue;
65-
private volatile DesiredBalance currentDesiredBalance = DesiredBalance.INITIAL;
67+
private final AtomicReference<DesiredBalance> currentDesiredBalanceRef = new AtomicReference<>(DesiredBalance.NOT_MASTER);
6668
private volatile boolean resetCurrentDesiredBalance = false;
6769
private final Set<String> processedNodeShutdowns = new HashSet<>();
6870
private final DesiredBalanceMetrics desiredBalanceMetrics;
@@ -129,6 +131,12 @@ protected void processInput(DesiredBalanceInput desiredBalanceInput) {
129131
long index = desiredBalanceInput.index();
130132
logger.debug("Starting desired balance computation for [{}]", index);
131133

134+
final DesiredBalance initialDesiredBalance = getInitialDesiredBalance();
135+
if (initialDesiredBalance == DesiredBalance.NOT_MASTER) {
136+
logger.debug("Abort desired balance computation because node is no longer master");
137+
return;
138+
}
139+
132140
recordTime(
133141
cumulativeComputationTime,
134142
// We set currentDesiredBalance back to INITIAL when the node stands down as master in onNoLongerMaster.
@@ -137,7 +145,7 @@ protected void processInput(DesiredBalanceInput desiredBalanceInput) {
137145
// lead to unexpected behaviours for tests. See also https://github.com/elastic/elasticsearch/pull/116904
138146
() -> setCurrentDesiredBalance(
139147
desiredBalanceComputer.compute(
140-
getInitialDesiredBalance(),
148+
initialDesiredBalance,
141149
desiredBalanceInput,
142150
pendingDesiredBalanceMoves,
143151
this::isFresh
@@ -146,7 +154,17 @@ protected void processInput(DesiredBalanceInput desiredBalanceInput) {
146154
);
147155
computationsExecuted.inc();
148156

149-
if (currentDesiredBalance.finishReason() == DesiredBalance.ComputationFinishReason.STOP_EARLY) {
157+
final DesiredBalance currentDesiredBalance = currentDesiredBalanceRef.get();
158+
if (currentDesiredBalance == DesiredBalance.NOT_MASTER || currentDesiredBalance == DesiredBalance.BECOME_MASTER_INITIAL) {
159+
logger.debug(
160+
() -> Strings.format(
161+
"Desired balance computation for [%s] is discarded since master has concurrently changed. "
162+
+ "Current desiredBalance=[%s]",
163+
index,
164+
currentDesiredBalance
165+
)
166+
);
167+
} else if (currentDesiredBalance.finishReason() == DesiredBalance.ComputationFinishReason.STOP_EARLY) {
150168
logger.debug(
151169
"Desired balance computation for [{}] terminated early with partial result, scheduling reconciliation",
152170
index
@@ -164,10 +182,13 @@ protected void processInput(DesiredBalanceInput desiredBalanceInput) {
164182
}
165183

166184
private DesiredBalance getInitialDesiredBalance() {
185+
final DesiredBalance currentDesiredBalance = currentDesiredBalanceRef.get();
167186
if (resetCurrentDesiredBalance) {
168187
logger.info("Resetting current desired balance");
169188
resetCurrentDesiredBalance = false;
170-
return new DesiredBalance(currentDesiredBalance.lastConvergedIndex(), Map.of());
189+
return currentDesiredBalance == DesiredBalance.NOT_MASTER
190+
? DesiredBalance.NOT_MASTER
191+
: new DesiredBalance(currentDesiredBalance.lastConvergedIndex(), Map.of());
171192
} else {
172193
return currentDesiredBalance;
173194
}
@@ -215,6 +236,10 @@ public void allocate(RoutingAllocation allocation, ActionListener<Void> listener
215236
var index = indexGenerator.incrementAndGet();
216237
logger.debug("Executing allocate for [{}]", index);
217238
queue.add(index, listener);
239+
// This can only run on master, so unset not-master if exists
240+
if (currentDesiredBalanceRef.compareAndSet(DesiredBalance.NOT_MASTER, DesiredBalance.BECOME_MASTER_INITIAL)) {
241+
logger.debug("initialized desired balance for becoming master");
242+
}
218243
desiredBalanceComputation.onNewInput(DesiredBalanceInput.create(index, allocation));
219244

220245
if (allocation.routingTable().indicesRouting().isEmpty()) {
@@ -224,7 +249,7 @@ public void allocate(RoutingAllocation allocation, ActionListener<Void> listener
224249
// Starts reconciliation towards desired balance that might have not been updated with a recent calculation yet.
225250
// This is fine as balance should have incremental rather than radical changes.
226251
// This should speed up achieving the desired balance in cases current state is still different from it (due to THROTTLING).
227-
reconcile(currentDesiredBalance, allocation);
252+
reconcile(currentDesiredBalanceRef.get(), allocation);
228253
}
229254

230255
private void processNodeShutdowns(ClusterState clusterState) {
@@ -267,16 +292,26 @@ private static List<MoveAllocationCommand> getMoveCommands(AllocationCommands co
267292
}
268293

269294
private void setCurrentDesiredBalance(DesiredBalance newDesiredBalance) {
270-
if (logger.isTraceEnabled()) {
271-
var diff = DesiredBalance.hasChanges(currentDesiredBalance, newDesiredBalance)
272-
? "Diff: " + DesiredBalance.humanReadableDiff(currentDesiredBalance, newDesiredBalance)
273-
: "No changes";
274-
logger.trace("Desired balance updated: {}. {}", newDesiredBalance, diff);
275-
} else {
276-
logger.debug("Desired balance updated for [{}]", newDesiredBalance.lastConvergedIndex());
295+
while (true) {
296+
final var oldDesiredBalance = currentDesiredBalanceRef.get();
297+
if (oldDesiredBalance == DesiredBalance.NOT_MASTER) {
298+
logger.debug("discard desired balance for [{}] since node is no longer master", newDesiredBalance.lastConvergedIndex());
299+
return;
300+
}
301+
302+
if (currentDesiredBalanceRef.compareAndSet(oldDesiredBalance, newDesiredBalance)) {
303+
if (logger.isTraceEnabled()) {
304+
var diff = DesiredBalance.hasChanges(oldDesiredBalance, newDesiredBalance)
305+
? "Diff: " + DesiredBalance.humanReadableDiff(oldDesiredBalance, newDesiredBalance)
306+
: "No changes";
307+
logger.trace("Desired balance updated: {}. {}", newDesiredBalance, diff);
308+
} else {
309+
logger.debug("Desired balance updated for [{}]", newDesiredBalance.lastConvergedIndex());
310+
}
311+
computedShardMovements.inc(DesiredBalance.shardMovements(oldDesiredBalance, newDesiredBalance));
312+
break;
313+
}
277314
}
278-
computedShardMovements.inc(DesiredBalance.shardMovements(currentDesiredBalance, newDesiredBalance));
279-
currentDesiredBalance = newDesiredBalance;
280315
}
281316

282317
protected void submitReconcileTask(DesiredBalance desiredBalance) {
@@ -316,7 +351,7 @@ public void execute(RoutingAllocation allocation) {
316351
}
317352

318353
public DesiredBalance getDesiredBalance() {
319-
return currentDesiredBalance;
354+
return currentDesiredBalanceRef.get();
320355
}
321356

322357
public void resetDesiredBalance() {
@@ -325,7 +360,7 @@ public void resetDesiredBalance() {
325360

326361
public DesiredBalanceStats getStats() {
327362
return new DesiredBalanceStats(
328-
Math.max(currentDesiredBalance.lastConvergedIndex(), 0L),
363+
Math.max(currentDesiredBalanceRef.get().lastConvergedIndex(), 0L),
329364
desiredBalanceComputation.isActive(),
330365
computationsSubmitted.count(),
331366
computationsExecuted.count(),
@@ -342,7 +377,7 @@ public DesiredBalanceStats getStats() {
342377

343378
private void onNoLongerMaster() {
344379
if (indexGenerator.getAndSet(-1) != -1) {
345-
currentDesiredBalance = DesiredBalance.INITIAL;
380+
currentDesiredBalanceRef.set(DesiredBalance.NOT_MASTER);
346381
queue.completeAllAsNotMaster();
347382
pendingDesiredBalanceMoves.clear();
348383
desiredBalanceReconciler.clear();
@@ -412,7 +447,7 @@ private static void discardSupersededTasks(
412447

413448
// only for tests - in production, this happens after reconciliation
414449
protected final void completeToLastConvergedIndex() {
415-
queue.complete(currentDesiredBalance.lastConvergedIndex());
450+
queue.complete(currentDesiredBalanceRef.get().lastConvergedIndex());
416451
}
417452

418453
private void recordTime(CounterMetric metric, Runnable action) {

server/src/test/java/org/elasticsearch/action/admin/cluster/allocation/TransportDeleteDesiredBalanceActionTests.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ public DesiredBalance compute(
136136
safeAwait((ActionListener<Void> listener) -> allocationService.reroute(clusterState, "inital-allocate", listener));
137137

138138
var balanceBeforeReset = allocator.getDesiredBalance();
139-
assertThat(balanceBeforeReset.lastConvergedIndex(), greaterThan(DesiredBalance.INITIAL.lastConvergedIndex()));
139+
assertThat(balanceBeforeReset.lastConvergedIndex(), greaterThan(DesiredBalance.BECOME_MASTER_INITIAL.lastConvergedIndex()));
140140
assertThat(balanceBeforeReset.assignments(), not(anEmptyMap()));
141141

142142
var listener = new PlainActionFuture<ActionResponse.Empty>();

server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputerTests.java

Lines changed: 38 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,12 @@ public void testComputeBalance() {
9696
var clusterState = createInitialClusterState(3);
9797
var index = clusterState.metadata().index(TEST_INDEX).getIndex();
9898

99-
var desiredBalance = desiredBalanceComputer.compute(DesiredBalance.INITIAL, createInput(clusterState), queue(), input -> true);
99+
var desiredBalance = desiredBalanceComputer.compute(
100+
DesiredBalance.BECOME_MASTER_INITIAL,
101+
createInput(clusterState),
102+
queue(),
103+
input -> true
104+
);
100105

101106
assertDesiredAssignments(
102107
desiredBalance,
@@ -115,7 +120,7 @@ public void testStopsComputingWhenStale() {
115120
var index = clusterState.metadata().index(TEST_INDEX).getIndex();
116121

117122
// if the isFresh flag is false then we only do one iteration, allocating the primaries but not the replicas
118-
var desiredBalance0 = DesiredBalance.INITIAL;
123+
var desiredBalance0 = DesiredBalance.BECOME_MASTER_INITIAL;
119124
var desiredBalance1 = desiredBalanceComputer.compute(desiredBalance0, createInput(clusterState), queue(), input -> false);
120125
assertDesiredAssignments(
121126
desiredBalance1,
@@ -147,7 +152,7 @@ public void testIgnoresOutOfScopePrimaries() {
147152
var primaryShard = mutateAllocationStatus(clusterState.routingTable().index(TEST_INDEX).shard(0).primaryShard());
148153

149154
var desiredBalance = desiredBalanceComputer.compute(
150-
DesiredBalance.INITIAL,
155+
DesiredBalance.BECOME_MASTER_INITIAL,
151156
createInput(clusterState, primaryShard),
152157
queue(),
153158
input -> true
@@ -184,7 +189,7 @@ public void testIgnoresOutOfScopeReplicas() {
184189
var replicaShard = mutateAllocationStatus(originalReplicaShard);
185190

186191
var desiredBalance = desiredBalanceComputer.compute(
187-
DesiredBalance.INITIAL,
192+
DesiredBalance.BECOME_MASTER_INITIAL,
188193
createInput(clusterState, replicaShard),
189194
queue(),
190195
input -> true
@@ -241,7 +246,7 @@ public void testAssignShardsToTheirPreviousLocationIfAvailable() {
241246
: new ShardRouting[] { clusterState.routingTable().index(TEST_INDEX).shard(0).primaryShard() };
242247

243248
var desiredBalance = desiredBalanceComputer.compute(
244-
DesiredBalance.INITIAL,
249+
DesiredBalance.BECOME_MASTER_INITIAL,
245250
createInput(clusterState, ignored),
246251
queue(),
247252
input -> true
@@ -284,7 +289,12 @@ public void testRespectsAssignmentOfUnknownPrimaries() {
284289
}
285290
clusterState = ClusterState.builder(clusterState).routingTable(RoutingTable.of(routingNodes)).build();
286291

287-
var desiredBalance = desiredBalanceComputer.compute(DesiredBalance.INITIAL, createInput(clusterState), queue(), input -> true);
292+
var desiredBalance = desiredBalanceComputer.compute(
293+
DesiredBalance.BECOME_MASTER_INITIAL,
294+
createInput(clusterState),
295+
queue(),
296+
input -> true
297+
);
288298

289299
assertDesiredAssignments(
290300
desiredBalance,
@@ -331,7 +341,12 @@ public void testRespectsAssignmentOfUnknownReplicas() {
331341
}
332342
clusterState = ClusterState.builder(clusterState).routingTable(RoutingTable.of(routingNodes)).build();
333343

334-
var desiredBalance = desiredBalanceComputer.compute(DesiredBalance.INITIAL, createInput(clusterState), queue(), input -> true);
344+
var desiredBalance = desiredBalanceComputer.compute(
345+
DesiredBalance.BECOME_MASTER_INITIAL,
346+
createInput(clusterState),
347+
queue(),
348+
input -> true
349+
);
335350

336351
assertDesiredAssignments(
337352
desiredBalance,
@@ -367,7 +382,7 @@ public void testRespectsAssignmentByGatewayAllocators() {
367382
}
368383

369384
var desiredBalance = desiredBalanceComputer.compute(
370-
DesiredBalance.INITIAL,
385+
DesiredBalance.BECOME_MASTER_INITIAL,
371386
DesiredBalanceInput.create(randomNonNegativeLong(), routingAllocation),
372387
queue(),
373388
input -> true
@@ -427,7 +442,12 @@ public ShardAllocationDecision decideShardAllocation(ShardRouting shard, Routing
427442
}
428443
clusterState = ClusterState.builder(clusterState).routingTable(RoutingTable.of(desiredRoutingNodes)).build();
429444

430-
var desiredBalance1 = desiredBalanceComputer.compute(DesiredBalance.INITIAL, createInput(clusterState), queue(), input -> true);
445+
var desiredBalance1 = desiredBalanceComputer.compute(
446+
DesiredBalance.BECOME_MASTER_INITIAL,
447+
createInput(clusterState),
448+
queue(),
449+
input -> true
450+
);
431451
assertDesiredAssignments(
432452
desiredBalance1,
433453
Map.of(
@@ -513,7 +533,12 @@ public void testNoDataNodes() {
513533
var desiredBalanceComputer = createDesiredBalanceComputer();
514534
var clusterState = createInitialClusterState(0);
515535

516-
var desiredBalance = desiredBalanceComputer.compute(DesiredBalance.INITIAL, createInput(clusterState), queue(), input -> true);
536+
var desiredBalance = desiredBalanceComputer.compute(
537+
DesiredBalance.BECOME_MASTER_INITIAL,
538+
createInput(clusterState),
539+
queue(),
540+
input -> true
541+
);
517542

518543
assertDesiredAssignments(desiredBalance, Map.of());
519544
}
@@ -532,7 +557,7 @@ public void testAppliesMoveCommands() {
532557
clusterState = ClusterState.builder(clusterState).routingTable(RoutingTable.of(routingNodes)).build();
533558

534559
var desiredBalance = desiredBalanceComputer.compute(
535-
DesiredBalance.INITIAL,
560+
DesiredBalance.BECOME_MASTER_INITIAL,
536561
createInput(clusterState),
537562
queue(
538563
new MoveAllocationCommand(index.getName(), 0, "node-1", "node-2"),
@@ -662,7 +687,7 @@ public void testDesiredBalanceShouldConvergeInABigCluster() {
662687

663688
var input = new DesiredBalanceInput(randomInt(), routingAllocationWithDecidersOf(clusterState, clusterInfo, settings), List.of());
664689
var desiredBalance = createDesiredBalanceComputer(new BalancedShardsAllocator(settings)).compute(
665-
DesiredBalance.INITIAL,
690+
DesiredBalance.BECOME_MASTER_INITIAL,
666691
input,
667692
queue(),
668693
ignored -> iteration.incrementAndGet() < 1000
@@ -1243,7 +1268,7 @@ public ShardAllocationDecision decideShardAllocation(ShardRouting shard, Routing
12431268
assertThatLogger(() -> {
12441269
var iteration = new AtomicInteger(0);
12451270
desiredBalanceComputer.compute(
1246-
DesiredBalance.INITIAL,
1271+
DesiredBalance.BECOME_MASTER_INITIAL,
12471272
createInput(createInitialClusterState(3)),
12481273
queue(),
12491274
input -> iteration.incrementAndGet() < iterations

0 commit comments

Comments
 (0)