Skip to content

Commit 00c9943

Browse files
authored
Ensure tasks preserve versions in MasterService (#109850) (#109900)
`ClusterState#version`, `Metadata#version` and `RoutingTable#version` are all managed solely by the `MasterService`, in the sense that it's a definite bug for the cluster state update task executor to meddle with them. Today if we encounter such a bug then we try and publish the resulting state anyway, which hopefully fails (triggering a master election) but it may in theory succeed (potentially reverting older cluster state updates). Neither is a particularly good outcome. With this commit we add a check for consistency of these version numbers during the cluster state computation and fail the state update without a master failover if a discrepancy is found. It also fixes a super-subtle bug in `TransportMigrateToDataTiersAction` that can muck up these version numbers.
1 parent 5c4614b commit 00c9943

File tree

5 files changed

+121
-9
lines changed

5 files changed

+121
-9
lines changed

docs/changelog/109850.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 109850
2+
summary: Ensure tasks preserve versions in `MasterService`
3+
area: Cluster Coordination
4+
type: bug
5+
issues: []

server/src/main/java/org/elasticsearch/cluster/service/MasterService.java

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -523,6 +523,24 @@ public Builder incrementVersion(ClusterState clusterState) {
523523
return ClusterState.builder(clusterState).incrementVersion();
524524
}
525525

526+
private static boolean versionNumbersPreserved(ClusterState oldState, ClusterState newState) {
527+
if (oldState.nodes().getMasterNodeId() == null && newState.nodes().getMasterNodeId() != null) {
528+
return true; // NodeJoinExecutor is special, we trust it to do the right thing with versions
529+
}
530+
531+
if (oldState.version() != newState.version()) {
532+
return false;
533+
}
534+
if (oldState.metadata().version() != newState.metadata().version()) {
535+
return false;
536+
}
537+
if (oldState.routingTable().version() != newState.routingTable().version()) {
538+
// GatewayService is special and for odd legacy reasons gets to do this:
539+
return oldState.clusterRecovered() == false && newState.clusterRecovered() && newState.routingTable().version() == 0;
540+
}
541+
return true;
542+
}
543+
526544
/**
527545
* Submits an unbatched cluster state update task. This method exists for legacy reasons but is deprecated and forbidden in new
528546
* production code because unbatched tasks are a source of performance and stability bugs. You should instead implement your update
@@ -1024,6 +1042,8 @@ private static <T extends ClusterStateTaskListener> boolean assertAllTasksComple
10241042
return true;
10251043
}
10261044

1045+
static final String TEST_ONLY_EXECUTOR_MAY_CHANGE_VERSION_NUMBER_TRANSIENT_NAME = "test_only_executor_may_change_version_number";
1046+
10271047
private static <T extends ClusterStateTaskListener> ClusterState innerExecuteTasks(
10281048
ClusterState previousClusterState,
10291049
List<ExecutionResult<T>> executionResults,
@@ -1036,13 +1056,23 @@ private static <T extends ClusterStateTaskListener> ClusterState innerExecuteTas
10361056
// to avoid leaking headers in production that were missed by tests
10371057

10381058
try {
1039-
return executor.execute(
1059+
final var updatedState = executor.execute(
10401060
new ClusterStateTaskExecutor.BatchExecutionContext<>(
10411061
previousClusterState,
10421062
executionResults,
10431063
threadContext::newStoredContext
10441064
)
10451065
);
1066+
if (versionNumbersPreserved(previousClusterState, updatedState) == false) {
1067+
// Shenanigans! Executors mustn't meddle with version numbers. Perhaps the executor based its update on the wrong
1068+
// initial state, potentially losing an intervening cluster state update. That'd be very bad!
1069+
final var exception = new IllegalStateException(
1070+
"cluster state update executor did not preserve version numbers: [" + summary.toString() + "]"
1071+
);
1072+
assert threadContext.getTransient(TEST_ONLY_EXECUTOR_MAY_CHANGE_VERSION_NUMBER_TRANSIENT_NAME) != null : exception;
1073+
throw exception;
1074+
}
1075+
return updatedState;
10461076
} catch (Exception e) {
10471077
logger.trace(
10481078
() -> format(

server/src/test/java/org/elasticsearch/cluster/service/MasterServiceTests.java

Lines changed: 70 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
import org.elasticsearch.cluster.node.DiscoveryNode;
3434
import org.elasticsearch.cluster.node.DiscoveryNodeUtils;
3535
import org.elasticsearch.cluster.node.DiscoveryNodes;
36+
import org.elasticsearch.cluster.routing.RoutingTable;
3637
import org.elasticsearch.common.Priority;
3738
import org.elasticsearch.common.Randomness;
3839
import org.elasticsearch.common.component.Lifecycle;
@@ -77,6 +78,7 @@
7778
import java.util.concurrent.atomic.AtomicBoolean;
7879
import java.util.concurrent.atomic.AtomicInteger;
7980
import java.util.concurrent.atomic.AtomicReference;
81+
import java.util.function.UnaryOperator;
8082
import java.util.stream.Collectors;
8183

8284
import static java.util.Collections.emptySet;
@@ -92,6 +94,7 @@
9294
import static org.hamcrest.Matchers.hasSize;
9395
import static org.hamcrest.Matchers.instanceOf;
9496
import static org.hamcrest.Matchers.lessThanOrEqualTo;
97+
import static org.hamcrest.Matchers.startsWith;
9598

9699
public class MasterServiceTests extends ESTestCase {
97100

@@ -497,7 +500,7 @@ public void onFailure(Exception e) {}
497500
@Override
498501
public ClusterState execute(ClusterState currentState) {
499502
relativeTimeInMillis += TimeValue.timeValueSeconds(3).millis();
500-
return ClusterState.builder(currentState).incrementVersion().build();
503+
return ClusterState.builder(currentState).build();
501504
}
502505

503506
@Override
@@ -1250,7 +1253,7 @@ public void onFailure(Exception e) {
12501253
public ClusterState execute(ClusterState currentState) {
12511254
relativeTimeInMillis += MasterService.MASTER_SERVICE_SLOW_TASK_LOGGING_THRESHOLD_SETTING.get(Settings.EMPTY).millis()
12521255
+ randomLongBetween(1, 1000000);
1253-
return ClusterState.builder(currentState).incrementVersion().build();
1256+
return ClusterState.builder(currentState).build();
12541257
}
12551258

12561259
@Override
@@ -1284,7 +1287,7 @@ public void onFailure(Exception e) {
12841287
masterService.submitUnbatchedStateUpdateTask("test5", new ClusterStateUpdateTask() {
12851288
@Override
12861289
public ClusterState execute(ClusterState currentState) {
1287-
return ClusterState.builder(currentState).incrementVersion().build();
1290+
return ClusterState.builder(currentState).build();
12881291
}
12891292

12901293
@Override
@@ -1300,7 +1303,7 @@ public void onFailure(Exception e) {
13001303
masterService.submitUnbatchedStateUpdateTask("test6", new ClusterStateUpdateTask() {
13011304
@Override
13021305
public ClusterState execute(ClusterState currentState) {
1303-
return ClusterState.builder(currentState).incrementVersion().build();
1306+
return ClusterState.builder(currentState).build();
13041307
}
13051308

13061309
@Override
@@ -2545,6 +2548,69 @@ public void onFailure(Exception e) {
25452548
}
25462549
}
25472550

2551+
public void testVersionNumberProtection() {
2552+
runVersionNumberProtectionTest(
2553+
currentState -> ClusterState.builder(currentState)
2554+
.version(randomFrom(currentState.version() - 1, currentState.version() + 1))
2555+
.build()
2556+
);
2557+
2558+
runVersionNumberProtectionTest(
2559+
currentState -> currentState.copyAndUpdateMetadata(
2560+
b -> b.version(randomFrom(currentState.metadata().version() - 1, currentState.metadata().version() + 1))
2561+
)
2562+
);
2563+
2564+
runVersionNumberProtectionTest(
2565+
currentState -> ClusterState.builder(currentState)
2566+
.routingTable(
2567+
RoutingTable.builder(currentState.routingTable())
2568+
.version(randomFrom(currentState.routingTable().version() - 1, currentState.routingTable().version() + 1))
2569+
.build()
2570+
)
2571+
.build()
2572+
);
2573+
}
2574+
2575+
private void runVersionNumberProtectionTest(UnaryOperator<ClusterState> updateOperator) {
2576+
final var deterministicTaskQueue = new DeterministicTaskQueue();
2577+
final var threadPool = deterministicTaskQueue.getThreadPool();
2578+
final var threadContext = threadPool.getThreadContext();
2579+
final var failureCaught = new AtomicBoolean();
2580+
2581+
try (
2582+
var masterService = createMasterService(true, null, threadPool, deterministicTaskQueue.getPrioritizedEsThreadPoolExecutor());
2583+
var ignored = threadContext.stashContext()
2584+
) {
2585+
final var taskId = randomIdentifier();
2586+
2587+
masterService.submitUnbatchedStateUpdateTask(taskId, new ClusterStateUpdateTask() {
2588+
@Override
2589+
public ClusterState execute(ClusterState currentState) {
2590+
return updateOperator.apply(currentState);
2591+
}
2592+
2593+
@Override
2594+
public void onFailure(Exception e) {
2595+
assertThat(
2596+
asInstanceOf(IllegalStateException.class, e).getMessage(),
2597+
allOf(startsWith("cluster state update executor did not preserve version numbers"), containsString(taskId))
2598+
);
2599+
assertTrue(failureCaught.compareAndSet(false, true));
2600+
}
2601+
});
2602+
2603+
// suppress assertion errors to check production behaviour
2604+
threadContext.putTransient(MasterService.TEST_ONLY_EXECUTOR_MAY_CHANGE_VERSION_NUMBER_TRANSIENT_NAME, new Object());
2605+
threadContext.markAsSystemContext();
2606+
deterministicTaskQueue.runAllRunnableTasks();
2607+
assertFalse(deterministicTaskQueue.hasRunnableTasks());
2608+
assertFalse(deterministicTaskQueue.hasDeferredTasks());
2609+
2610+
assertTrue(failureCaught.get());
2611+
}
2612+
}
2613+
25482614
/**
25492615
* Returns the cluster state that the master service uses (and that is provided by the discovery layer)
25502616
*/

server/src/test/java/org/elasticsearch/gateway/GatewayServiceTests.java

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import org.elasticsearch.cluster.node.DiscoveryNodeUtils;
2121
import org.elasticsearch.cluster.node.DiscoveryNodes;
2222
import org.elasticsearch.cluster.routing.RerouteService;
23+
import org.elasticsearch.cluster.routing.RoutingTable;
2324
import org.elasticsearch.cluster.routing.ShardRoutingRoleStrategy;
2425
import org.elasticsearch.cluster.service.ClusterApplierService;
2526
import org.elasticsearch.cluster.service.ClusterService;
@@ -494,12 +495,22 @@ public void onFailure(Exception e) {
494495

495496
private MasterServiceTaskQueue<SetClusterStateTask> createSetClusterStateTaskQueue(ClusterService clusterService) {
496497
return clusterService.createTaskQueue("set-cluster-state", Priority.NORMAL, batchExecutionContext -> {
497-
ClusterState targetState = batchExecutionContext.initialState();
498+
final var initialState = batchExecutionContext.initialState();
499+
var targetState = initialState;
498500
for (var taskContext : batchExecutionContext.taskContexts()) {
499501
targetState = taskContext.getTask().clusterState();
500502
taskContext.success(() -> {});
501503
}
502-
return targetState;
504+
// fix up the version numbers
505+
final var finalStateBuilder = ClusterState.builder(targetState)
506+
.version(initialState.version())
507+
.metadata(Metadata.builder(targetState.metadata()).version(initialState.metadata().version()));
508+
if (initialState.clusterRecovered() || targetState.clusterRecovered() == false) {
509+
finalStateBuilder.routingTable(
510+
RoutingTable.builder(targetState.routingTable()).version(initialState.routingTable().version())
511+
);
512+
}
513+
return finalStateBuilder.build();
503514
});
504515
}
505516
}

x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/action/TransportMigrateToDataTiersAction.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,9 +123,9 @@ protected void masterOperation(
123123
final SetOnce<MigratedEntities> migratedEntities = new SetOnce<>();
124124
submitUnbatchedTask("migrate-to-data-tiers []", new ClusterStateUpdateTask(Priority.HIGH) {
125125
@Override
126-
public ClusterState execute(ClusterState currentState) throws Exception {
126+
public ClusterState execute(ClusterState currentState) {
127127
Tuple<ClusterState, MigratedEntities> migratedEntitiesTuple = migrateToDataTiersRouting(
128-
state,
128+
currentState,
129129
request.getNodeAttributeName(),
130130
request.getLegacyTemplateToDelete(),
131131
xContentRegistry,

0 commit comments

Comments
 (0)