Skip to content

Commit 4109412

Browse files
committed
Pre-compute hasAssignedQueuedShards
Optimize when to kickoff starting assigned queued shards. Fix allocation filter permit release
1 parent 292649d commit 4109412

File tree

4 files changed

+119
-59
lines changed

4 files changed

+119
-59
lines changed

server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotStressTestsIT.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1398,6 +1398,10 @@ private void pollForAllocationFilterCompletion(
13981398
Releasable onCompletion,
13991399
Runnable onSuccess
14001400
) {
1401+
if (shouldStop.get()) {
1402+
Releasables.close(onCompletion);
1403+
return;
1404+
}
14011405
clientExecutor.execute(mustSucceed(() -> {
14021406
final var clusterService = internalCluster().getCurrentMasterNodeInstance(ClusterService.class);
14031407
final ClusterState state = clusterService.state();

server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -913,6 +913,12 @@ public static class Entry implements Writeable, ToXContentObject, RepositoryOper
913913
*/
914914
private final boolean hasShardsInInitState;
915915

916+
/**
917+
* Flag set to true in case any of the shard snapshots in {@link #shards} are {@link ShardSnapshotStatus#isAssignedQueued}.
918+
* This is used to avoid having to iterate the full {@link #shards} map.
919+
*/
920+
private final boolean hasAssignedQueuedShards;
921+
916922
// visible for testing, use #startedEntry and copy constructors in production code
917923
public static Entry snapshot(
918924
Snapshot snapshot,
@@ -932,6 +938,7 @@ public static Entry snapshot(
932938
final Map<String, Index> res = Maps.newMapWithExpectedSize(indices.size());
933939
final Map<RepositoryShardId, ShardSnapshotStatus> byRepoShardIdBuilder = Maps.newHashMapWithExpectedSize(shards.size());
934940
boolean hasInitStateShards = false;
941+
boolean hasAssignedQueuedShards = false;
935942
for (Map.Entry<ShardId, ShardSnapshotStatus> entry : shards.entrySet()) {
936943
final ShardId shardId = entry.getKey();
937944
final IndexId indexId = indices.get(shardId.getIndexName());
@@ -940,6 +947,7 @@ public static Entry snapshot(
940947
assert existing == null || existing.equals(index) : "Conflicting indices [" + existing + "] and [" + index + "]";
941948
final var shardSnapshotStatus = entry.getValue();
942949
hasInitStateShards |= shardSnapshotStatus.state() == ShardState.INIT;
950+
hasAssignedQueuedShards |= shardSnapshotStatus.isAssignedQueued();
943951
byRepoShardIdBuilder.put(new RepositoryShardId(indexId, shardId.id()), shardSnapshotStatus);
944952
}
945953
return new Entry(
@@ -959,7 +967,8 @@ public static Entry snapshot(
959967
null,
960968
byRepoShardIdBuilder,
961969
res,
962-
hasInitStateShards
970+
hasInitStateShards,
971+
hasAssignedQueuedShards
963972
);
964973
}
965974

@@ -991,6 +1000,7 @@ private static Entry createClone(
9911000
source,
9921001
shardStatusByRepoShardId,
9931002
Map.of(),
1003+
false,
9941004
false
9951005
);
9961006
}
@@ -1012,7 +1022,8 @@ private Entry(
10121022
@Nullable SnapshotId source,
10131023
Map<RepositoryShardId, ShardSnapshotStatus> shardStatusByRepoShardId,
10141024
Map<String, Index> snapshotIndices,
1015-
boolean hasShardsInInitState
1025+
boolean hasShardsInInitState,
1026+
boolean hasAssignedQueuedShards
10161027
) {
10171028
this.state = state;
10181029
this.snapshot = snapshot;
@@ -1031,13 +1042,15 @@ private Entry(
10311042
this.shardStatusByRepoShardId = Map.copyOf(shardStatusByRepoShardId);
10321043
this.snapshotIndices = snapshotIndices;
10331044
this.hasShardsInInitState = hasShardsInInitState;
1045+
this.hasAssignedQueuedShards = hasAssignedQueuedShards;
10341046
assert assertShardsConsistent(
10351047
this.source,
10361048
this.state,
10371049
this.indices,
10381050
this.shards,
10391051
this.shardStatusByRepoShardId,
1040-
this.hasShardsInInitState
1052+
this.hasShardsInInitState,
1053+
this.hasAssignedQueuedShards
10411054
);
10421055
}
10431056

@@ -1087,14 +1100,18 @@ private static boolean assertShardsConsistent(
10871100
Map<String, IndexId> indices,
10881101
Map<ShardId, ShardSnapshotStatus> shards,
10891102
Map<RepositoryShardId, ShardSnapshotStatus> statusByRepoShardId,
1090-
boolean hasInitStateShards
1103+
boolean hasInitStateShards,
1104+
boolean hasAssignedQueuedShards
10911105
) {
10921106
if ((state == State.INIT || state == State.ABORTED) && shards.isEmpty()) {
10931107
return true;
10941108
}
10951109
if (hasInitStateShards) {
10961110
assert state == State.STARTED : "shouldn't have INIT-state shards in state " + state;
10971111
}
1112+
if (hasAssignedQueuedShards) {
1113+
assert source == null : "clone entry must not have any shards in assigned-queued state";
1114+
}
10981115
final Set<String> indexNames = indices.keySet();
10991116
final Set<String> indexNamesInShards = new HashSet<>();
11001117
shards.entrySet().forEach(s -> {
@@ -1150,7 +1167,8 @@ public Entry withRepoGen(long newRepoGen) {
11501167
source,
11511168
shardStatusByRepoShardId,
11521169
snapshotIndices,
1153-
hasShardsInInitState
1170+
hasShardsInInitState,
1171+
hasAssignedQueuedShards
11541172
);
11551173
}
11561174

@@ -1403,6 +1421,15 @@ public boolean hasShardsInInitState() {
14031421
return hasShardsInInitState;
14041422
}
14051423

1424+
/**
1425+
* See {@link #hasAssignedQueuedShards}.
1426+
*/
1427+
public boolean hasAssignedQueuedShards() {
1428+
assert hasAssignedQueuedShards == false || isClone() == false
1429+
: "a clone entry must not have assigned-queued shards, but saw " + this;
1430+
return hasAssignedQueuedShards;
1431+
}
1432+
14061433
public boolean partial() {
14071434
return partial;
14081435
}
@@ -1784,7 +1811,8 @@ public Entry apply(Entry part) {
17841811
null,
17851812
part.shardStatusByRepoShardId,
17861813
part.snapshotIndices,
1787-
part.hasShardsInInitState
1814+
part.hasShardsInInitState,
1815+
part.hasAssignedQueuedShards
17881816
);
17891817
}
17901818
if (part.isClone()) {

server/src/main/java/org/elasticsearch/snapshots/PerNodeShardSnapshotCounter.java

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,25 @@ public boolean hasCapacityOnAnyNode() {
9595
return enabled() == false || perNodeCounts.values().stream().anyMatch(count -> count < shardSnapshotPerNodeLimit);
9696
}
9797

98+
public boolean hasCapacityOnNode(String nodeId) {
99+
if (enabled() == false) {
100+
return true;
101+
}
102+
final Integer count = perNodeCounts.get(nodeId);
103+
return count != null && count < shardSnapshotPerNodeLimit;
104+
}
105+
98106
private boolean enabled() {
99107
return shardSnapshotPerNodeLimit > 0;
100108
}
109+
110+
@Override
111+
public String toString() {
112+
return "PerNodeShardSnapshotCounter{"
113+
+ "shardSnapshotPerNodeLimit="
114+
+ shardSnapshotPerNodeLimit
115+
+ ", perNodeCounts="
116+
+ perNodeCounts
117+
+ '}';
118+
}
101119
}

server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java

Lines changed: 63 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1342,7 +1342,6 @@ private static ImmutableOpenMap<ShardId, ShardSnapshotStatus> processWaitingShar
13421342
shards.put(shardId, failedState);
13431343
knownFailures.put(shardSnapshotEntry.getKey(), failedState);
13441344
} else if (shardStatus.state().completed() == false && shardStatus.nodeId() != null) {
1345-
// TODO: This branch applies to assigned-queued shards. It seems OK since it applies to INIT as well. Double check it.
13461345
if (nodes.nodeExists(shardStatus.nodeId())) {
13471346
shards.put(shardId, shardStatus);
13481347
} else {
@@ -3089,16 +3088,16 @@ private static SnapshotsInProgress maybeStartAssignedQueuedShardSnapshotsForRepo
30893088
Runnable changedCallback,
30903089
Runnable startedCallback
30913090
) {
3091+
assert perNodeShardSnapshotCounter.hasCapacityOnAnyNode() : "no capacity left on any node " + perNodeShardSnapshotCounter;
30923092
final List<SnapshotsInProgress.Entry> oldEntries = snapshotsInProgress.forRepo(projectRepo);
3093-
if (oldEntries.isEmpty()) {
3093+
if (oldEntries.isEmpty() || oldEntries.stream().allMatch(entry -> entry.hasAssignedQueuedShards() == false)) {
30943094
return snapshotsInProgress;
30953095
}
30963096
final List<SnapshotsInProgress.Entry> newEntries = new ArrayList<>(oldEntries.size());
30973097
for (SnapshotsInProgress.Entry entry : oldEntries) {
3098-
if (entry.isClone() == false && perNodeShardSnapshotCounter.hasCapacityOnAnyNode()) {
3099-
// TODO: Optimize this by checking whether the entry has any assigned-queued shards before building the shards map
3098+
if (entry.hasAssignedQueuedShards() && perNodeShardSnapshotCounter.hasCapacityOnAnyNode()) {
31003099
final var shardsBuilder = ImmutableOpenMap.builder(entry.shards());
3101-
maybeStartAssignedQueuedShardSnapshots(
3100+
final var changed = maybeStartAssignedQueuedShardSnapshots(
31023101
clusterState,
31033102
entry,
31043103
snapshotsInProgress::isNodeIdForRemoval,
@@ -3108,10 +3107,12 @@ private static SnapshotsInProgress maybeStartAssignedQueuedShardSnapshotsForRepo
31083107
changedCallback,
31093108
startedCallback
31103109
);
3111-
final var newEntry = entry.withShardStates(shardsBuilder.build());
3112-
newEntries.add(newEntry);
3113-
if (newEntry != entry) {
3110+
if (changed) {
3111+
final var newEntry = entry.withShardStates(shardsBuilder.build());
3112+
newEntries.add(newEntry);
31143113
newEntryConsumer.accept(newEntry);
3114+
} else {
3115+
newEntries.add(entry);
31153116
}
31163117
} else {
31173118
newEntries.add(entry);
@@ -3120,7 +3121,7 @@ private static SnapshotsInProgress maybeStartAssignedQueuedShardSnapshotsForRepo
31203121
return snapshotsInProgress.createCopyWithUpdatedEntriesForRepo(projectRepo.projectId(), projectRepo.name(), newEntries);
31213122
}
31223123

3123-
private static void maybeStartAssignedQueuedShardSnapshots(
3124+
private static boolean maybeStartAssignedQueuedShardSnapshots(
31243125
ClusterState clusterState,
31253126
SnapshotsInProgress.Entry entry,
31263127
Predicate<String> nodeIdRemovalPredicate,
@@ -3130,41 +3131,46 @@ private static void maybeStartAssignedQueuedShardSnapshots(
31303131
Runnable changedCallback,
31313132
Runnable startedCallback
31323133
) {
3133-
if (perNodeShardSnapshotCounter.hasCapacityOnAnyNode()) {
3134-
for (var shardId : shardsBuilder.keys()) {
3135-
final var existingShardSnapshotStatus = shardsBuilder.get(shardId);
3136-
if (existingShardSnapshotStatus.isAssignedQueued() == false) {
3137-
continue;
3138-
}
3139-
final IndexRoutingTable indexRouting = clusterState.routingTable(entry.projectId()).index(shardId.getIndex());
3140-
final ShardRouting shardRouting;
3141-
if (indexRouting == null) {
3142-
shardRouting = null;
3143-
} else {
3144-
shardRouting = indexRouting.shard(shardId.id()).primaryShard();
3145-
}
3146-
final var newShardSnapshotStatus = initShardSnapshotStatus(
3147-
existingShardSnapshotStatus.generation(),
3148-
shardRouting,
3149-
nodeIdRemovalPredicate,
3150-
perNodeShardSnapshotCounter
3151-
);
3152-
if (newShardSnapshotStatus.state().completed()) {
3153-
// It can become complete if the shard is unassigned or deleted, i.e. state == MISSING.
3154-
// We cannot directly update its status here because there maybe another snapshot for
3155-
// the same shard that is QUEUED which must be updated as well, i.e. vertical update.
3156-
// So we submit the status update to let it be processed in a future cluster state update.
3157-
shardStatusUpdateConsumer.apply(entry.snapshot(), shardId, newShardSnapshotStatus);
3158-
continue;
3159-
} else if (newShardSnapshotStatus.equals(existingShardSnapshotStatus) == false) {
3160-
changedCallback.run();
3161-
if (newShardSnapshotStatus.state() == ShardState.INIT) {
3162-
startedCallback.run();
3163-
}
3134+
assert entry.hasAssignedQueuedShards() : "entry has no assigned queued shards: " + entry;
3135+
assert perNodeShardSnapshotCounter.hasCapacityOnAnyNode() : "no capacity left on any node " + perNodeShardSnapshotCounter;
3136+
boolean changed = false;
3137+
for (var shardId : shardsBuilder.keys()) {
3138+
if (perNodeShardSnapshotCounter.hasCapacityOnAnyNode() == false) {
3139+
return changed;
3140+
}
3141+
final var existingShardSnapshotStatus = shardsBuilder.get(shardId);
3142+
if (existingShardSnapshotStatus.isAssignedQueued() == false) {
3143+
continue;
3144+
}
3145+
final IndexRoutingTable indexRouting = clusterState.routingTable(entry.projectId()).index(shardId.getIndex());
3146+
final ShardRouting shardRouting;
3147+
if (indexRouting == null) {
3148+
shardRouting = null;
3149+
} else {
3150+
shardRouting = indexRouting.shard(shardId.id()).primaryShard();
3151+
}
3152+
final var newShardSnapshotStatus = initShardSnapshotStatus(
3153+
existingShardSnapshotStatus.generation(),
3154+
shardRouting,
3155+
nodeIdRemovalPredicate,
3156+
perNodeShardSnapshotCounter
3157+
);
3158+
if (newShardSnapshotStatus.state().completed()) {
3159+
// It can become complete if the shard is unassigned or deleted, i.e. state == MISSING.
3160+
// We cannot directly update its status here because there maybe another snapshot for
3161+
// the same shard that is QUEUED which must be updated as well, i.e. vertical update.
3162+
// So we submit the status update to let it be processed in a future cluster state update.
3163+
shardStatusUpdateConsumer.apply(entry.snapshot(), shardId, newShardSnapshotStatus);
3164+
} else if (newShardSnapshotStatus.equals(existingShardSnapshotStatus) == false) {
3165+
changedCallback.run();
3166+
if (newShardSnapshotStatus.state() == ShardState.INIT) {
3167+
startedCallback.run();
31643168
}
31653169
shardsBuilder.put(shardId, newShardSnapshotStatus);
3170+
changed = true;
31663171
}
31673172
}
3173+
return changed;
31683174
}
31693175

31703176
/**
@@ -3555,7 +3561,7 @@ SnapshotsInProgress computeUpdatedState() {
35553561
// due to snapshots running for a repository that is update to completion in this batch.
35563562
// (2) Repos that have seen updates in this batch because updates releasing capacity may all belong to later snapshots
35573563
// than the one has assigned-queued shards. These updates could be either for the same repo or a different repo.
3558-
for (var repo : existing.repos()) {
3564+
for (var repo : updated.repos()) {
35593565
if (perNodeShardSnapshotCounter.hasCapacityOnAnyNode() == false) {
35603566
break;
35613567
}
@@ -3690,18 +3696,22 @@ SnapshotsInProgress.Entry computeUpdatedSnapshotEntryFromShardUpdates() {
36903696
+ " as well as "
36913697
+ shardsBuilder;
36923698

3693-
// Shard snapshots changed status for this entry, check within the snapshot to see whether any previously limited
3694-
// shard snapshots can now start due to newly completed ones.
3695-
maybeStartAssignedQueuedShardSnapshots(
3696-
initialState,
3697-
entry,
3698-
nodeIdRemovalPredicate,
3699-
shardsBuilder,
3700-
perNodeShardSnapshotCounter,
3701-
shardStatusUpdateConsumer,
3702-
() -> changedCount++,
3703-
() -> startedCount++
3704-
);
3699+
if (entry.hasAssignedQueuedShards() && perNodeShardSnapshotCounter.hasCapacityOnAnyNode()) {
3700+
// Shard snapshots changed status for this entry, check within the snapshot to see whether any previously limited
3701+
// shard snapshots can now start due to newly completed ones. This is only necessary if the entry has any
3702+
// assigned-queued shards before the update. If the entry gets any new assigned-queued shards from processing the
3703+
// update, they cannot be started anyway because they already reflect the latest node capacities.
3704+
maybeStartAssignedQueuedShardSnapshots(
3705+
initialState,
3706+
entry,
3707+
nodeIdRemovalPredicate,
3708+
shardsBuilder,
3709+
perNodeShardSnapshotCounter,
3710+
shardStatusUpdateConsumer,
3711+
() -> changedCount++,
3712+
() -> startedCount++
3713+
);
3714+
}
37053715
return entry.withShardStates(shardsBuilder.build());
37063716
} else if (clonesBuilder != null) {
37073717
return entry.withClones(clonesBuilder.build());

0 commit comments

Comments
 (0)