Add split recovery source #124834
Changes from all commits
ShardStateAction.java (org.elasticsearch.cluster.action.shard)
@@ -25,6 +25,8 @@ | |
| import org.elasticsearch.cluster.NotMasterException; | ||
| import org.elasticsearch.cluster.coordination.FailedToCommitClusterStateException; | ||
| import org.elasticsearch.cluster.metadata.IndexMetadata; | ||
| import org.elasticsearch.cluster.metadata.IndexReshardingMetadata; | ||
| import org.elasticsearch.cluster.metadata.IndexReshardingState; | ||
| import org.elasticsearch.cluster.metadata.Metadata; | ||
| import org.elasticsearch.cluster.metadata.ProjectId; | ||
| import org.elasticsearch.cluster.metadata.ProjectMetadata; | ||
|
|
@@ -41,6 +43,7 @@ | |
| import org.elasticsearch.common.Strings; | ||
| import org.elasticsearch.common.io.stream.StreamInput; | ||
| import org.elasticsearch.common.io.stream.StreamOutput; | ||
| import org.elasticsearch.common.io.stream.Writeable; | ||
| import org.elasticsearch.common.util.concurrent.EsExecutors; | ||
| import org.elasticsearch.core.Nullable; | ||
| import org.elasticsearch.core.TimeValue; | ||
|
|
@@ -580,6 +583,31 @@ public void shardStarted( | |
| ); | ||
| } | ||
|
|
||
| public void shardSplit( | ||
| final ShardRouting shardRouting, | ||
| final long primaryTerm, | ||
| final String message, | ||
| final ShardLongFieldRange timestampRange, | ||
| final ShardLongFieldRange eventIngestedRange, | ||
| final long sourcePrimaryTerm, | ||
| final ActionListener<Void> listener | ||
| ) { | ||
| ClusterState currentState = clusterService.state(); | ||
| remoteShardStateUpdateDeduplicator.executeOnce( | ||
| new StartedShardEntry( | ||
| shardRouting.shardId(), | ||
| shardRouting.allocationId().getId(), | ||
| primaryTerm, | ||
| message, | ||
| timestampRange, | ||
| eventIngestedRange, | ||
| new ShardSplit(sourcePrimaryTerm) | ||
| ), | ||
| listener, | ||
| (req, l) -> sendShardAction(SHARD_STARTED_ACTION_NAME, currentState, req, l) | ||
| ); | ||
| } | ||
|
|
||
| // TODO: Make this a TransportMasterNodeAction and remove duplication of master failover retrying from upstream code | ||
| private static class ShardStartedTransportHandler implements TransportRequestHandler<StartedShardEntry> { | ||
| private final MasterServiceTaskQueue<StartedShardUpdateTask> taskQueue; | ||
|
|
@@ -691,6 +719,12 @@ public ClusterState execute(BatchExecutionContext<StartedShardUpdateTask> batchE | |
| matched | ||
| ); | ||
| tasksToBeApplied.add(taskContext); | ||
| } else if (invalidShardSplit(startedShardEntry, projectId, initialState)) { | ||
| logger.debug("{} failing shard started task because split validation failed", startedShardEntry.shardId); | ||
| // TODO: Currently invalid shard split triggers if the primary term changes, the source primary term changes or | ||
| // is >= the target primary term or if the source is relocating. In the second and third scenarios this will be | ||
| // swallowed currently. In the split process we will need to handle this. | ||
| taskContext.success(() -> task.onFailure(new IllegalStateException("Cannot start"))); | ||
| } else { | ||
| logger.debug( | ||
| "{} starting shard {} (shard started task: [{}])", | ||
|
|
@@ -789,6 +823,31 @@ public ClusterState execute(BatchExecutionContext<StartedShardUpdateTask> batchE | |
| return maybeUpdatedState; | ||
| } | ||
|
|
||
| private static boolean invalidShardSplit(StartedShardEntry startedShardEntry, ProjectId projectId, ClusterState clusterState) { | ||
| ShardSplit shardSplit = startedShardEntry.shardSplit; | ||
| if (shardSplit == null) { | ||
| return false; | ||
| } | ||
| IndexRoutingTable routingTable = clusterState.routingTable(projectId).index(startedShardEntry.shardId.getIndex()); | ||
| final IndexMetadata indexMetadata = clusterState.metadata().getProject(projectId).index(startedShardEntry.shardId.getIndex()); | ||
| assert indexMetadata != null; | ||
| IndexReshardingMetadata reshardingMetadata = indexMetadata.getReshardingMetadata(); | ||
| assert reshardingMetadata != null; | ||
| IndexReshardingState.Split split = reshardingMetadata.getSplit(); | ||
|
Review comment: I suppose we could have
||
| int sourceShardId = startedShardEntry.shardId.getId() % split.shardCountBefore(); | ||
|
Review comment: should we add a function to
||
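If that (truncated) suggestion refers to the modulo on the line above, one hypothetical shape for a helper on `IndexReshardingState.Split` could be the following sketch; the method name `sourceShardId` is invented here and is not an existing API:

```java
// Hypothetical helper (not in this PR): map a target shard id back to the source
// shard it is being split from, using the shard count before the split.
public int sourceShardId(int targetShardId) {
    return targetShardId % shardCountBefore();
}
```

The call site would then read `split.sourceShardId(startedShardEntry.shardId.getId())` instead of computing the modulo inline.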
| long currentSourcePrimaryTerm = indexMetadata.primaryTerm(sourceShardId); | ||
| long primaryTermDiff = startedShardEntry.primaryTerm - currentSourcePrimaryTerm; | ||
| // The source primary term must not have changed, the target primary term must be equal to or greater, and the source | ||
| // cannot be relocating. | ||
| if (startedShardEntry.shardSplit.sourcePrimaryTerm() != currentSourcePrimaryTerm | ||
| || primaryTermDiff < 0 | ||
| || routingTable.shard(sourceShardId).primaryShard().relocating()) { | ||
| return true; | ||
| } else { | ||
| return false; | ||
| } | ||
| } | ||
|
|
||
| private static boolean assertStartedIndicesHaveCompleteTimestampRanges(ClusterState clusterState) { | ||
| for (ProjectId projectId : clusterState.metadata().projects().keySet()) { | ||
| for (Map.Entry<String, IndexRoutingTable> cursor : clusterState.routingTable(projectId).getIndicesRouting().entrySet()) { | ||
|
|
@@ -827,13 +886,26 @@ public void clusterStatePublished(ClusterState newClusterState) { | |
| } | ||
| } | ||
|
|
||
| record ShardSplit(long sourcePrimaryTerm) implements Writeable { | ||
|
|
||
| ShardSplit(StreamInput in) throws IOException { | ||
| this(in.readVLong()); | ||
| } | ||
|
|
||
| @Override | ||
| public void writeTo(StreamOutput out) throws IOException { | ||
| out.writeVLong(sourcePrimaryTerm); | ||
| } | ||
| } | ||
|
|
||
| public static class StartedShardEntry extends TransportRequest { | ||
| final ShardId shardId; | ||
| final String allocationId; | ||
| final long primaryTerm; | ||
| final String message; | ||
| final ShardLongFieldRange timestampRange; | ||
| final ShardLongFieldRange eventIngestedRange; | ||
| final ShardSplit shardSplit; | ||
|
|
||
| StartedShardEntry(StreamInput in) throws IOException { | ||
| super(in); | ||
|
|
@@ -847,6 +919,11 @@ public static class StartedShardEntry extends TransportRequest { | |
| } else { | ||
| this.eventIngestedRange = ShardLongFieldRange.UNKNOWN; | ||
| } | ||
| if (in.getTransportVersion().onOrAfter(TransportVersions.SOURCE_PRIMARY_TERM_IN_START_SHARD)) { | ||
| this.shardSplit = in.readOptionalWriteable(ShardSplit::new); | ||
| } else { | ||
| this.shardSplit = null; | ||
| } | ||
| } | ||
|
|
||
| public StartedShardEntry( | ||
|
|
@@ -856,13 +933,26 @@ public StartedShardEntry( | |
| final String message, | ||
| final ShardLongFieldRange timestampRange, | ||
| final ShardLongFieldRange eventIngestedRange | ||
| ) { | ||
| this(shardId, allocationId, primaryTerm, message, timestampRange, eventIngestedRange, null); | ||
| } | ||
|
|
||
| public StartedShardEntry( | ||
| final ShardId shardId, | ||
| final String allocationId, | ||
| final long primaryTerm, | ||
| final String message, | ||
| final ShardLongFieldRange timestampRange, | ||
| final ShardLongFieldRange eventIngestedRange, | ||
| @Nullable final ShardSplit shardSplit | ||
| ) { | ||
| this.shardId = shardId; | ||
| this.allocationId = allocationId; | ||
| this.primaryTerm = primaryTerm; | ||
| this.message = message; | ||
| this.timestampRange = timestampRange; | ||
| this.eventIngestedRange = eventIngestedRange; | ||
| this.shardSplit = shardSplit; | ||
| } | ||
|
|
||
| @Override | ||
|
|
@@ -876,6 +966,9 @@ public void writeTo(StreamOutput out) throws IOException { | |
| if (out.getTransportVersion().onOrAfter(TransportVersions.V_8_15_0)) { | ||
| eventIngestedRange.writeTo(out); | ||
| } | ||
| if (out.getTransportVersion().onOrAfter(TransportVersions.SOURCE_PRIMARY_TERM_IN_START_SHARD)) { | ||
| out.writeOptionalWriteable(shardSplit); | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
|
|
@@ -891,20 +984,20 @@ public String toString() { | |
|
|
||
| @Override | ||
| public boolean equals(Object o) { | ||
| if (this == o) return true; | ||
| if (o == null || getClass() != o.getClass()) return false; | ||
| StartedShardEntry that = (StartedShardEntry) o; | ||
| return primaryTerm == that.primaryTerm | ||
| && shardId.equals(that.shardId) | ||
| && allocationId.equals(that.allocationId) | ||
| && message.equals(that.message) | ||
| && timestampRange.equals(that.timestampRange) | ||
| && eventIngestedRange.equals(that.eventIngestedRange); | ||
| && Objects.equals(shardId, that.shardId) | ||
| && Objects.equals(allocationId, that.allocationId) | ||
| && Objects.equals(message, that.message) | ||
| && Objects.equals(timestampRange, that.timestampRange) | ||
| && Objects.equals(eventIngestedRange, that.eventIngestedRange) | ||
| && Objects.equals(shardSplit, that.shardSplit); | ||
| } | ||
|
|
||
| @Override | ||
| public int hashCode() { | ||
| return Objects.hash(shardId, allocationId, primaryTerm, message, timestampRange, eventIngestedRange); | ||
| return Objects.hash(shardId, allocationId, primaryTerm, message, timestampRange, eventIngestedRange, shardSplit); | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -946,7 +1039,5 @@ public NoLongerPrimaryShardException(ShardId shardId, String msg) { | |
| public NoLongerPrimaryShardException(StreamInput in) throws IOException { | ||
| super(in); | ||
| } | ||
|
|
||
| } | ||
|
|
||
| } | ||
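To make the intent of the new entry point concrete, here is a rough, hypothetical call-site fragment (not part of this PR) showing how a target shard that finished recovering from its split source might report itself started, carrying the source shard's primary term for the validation above; the surrounding variables are assumed to be in scope:

```java
// Hypothetical caller fragment (not in this PR): report a split target shard as started.
if (shardRouting.recoverySource().getType() == RecoverySource.Type.SPLIT) {
    shardStateAction.shardSplit(
        shardRouting,
        primaryTerm,                 // primary term of the target shard being started
        "after split recovery",      // message carried in the shard-started entry
        timestampRange,
        eventIngestedRange,
        sourcePrimaryTerm,           // primary term of the source shard recovered from
        ActionListener.noop()
    );
}
```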
RecoverySource.java (org.elasticsearch.cluster.routing)
@@ -31,6 +31,7 @@ | |
| * - {@link PeerRecoverySource} recovery from a primary on another node | ||
| * - {@link SnapshotRecoverySource} recovery from a snapshot | ||
| * - {@link LocalShardsRecoverySource} recovery from other shards of another index on the same node | ||
| * - {@link SplitRecoverySource} recovery of a target shard split from a source shard | ||
| */ | ||
| public abstract class RecoverySource implements Writeable, ToXContentObject { | ||
|
|
||
|
|
@@ -57,6 +58,7 @@ public static RecoverySource readFrom(StreamInput in) throws IOException { | |
| case PEER -> PeerRecoverySource.INSTANCE; | ||
| case SNAPSHOT -> new SnapshotRecoverySource(in); | ||
| case LOCAL_SHARDS -> LocalShardsRecoverySource.INSTANCE; | ||
| case SPLIT -> SplitRecoverySource.INSTANCE; | ||
|
Review comment: Can we add this recovery type to the top of this class where the other ones are also listed.
||
| }; | ||
| } | ||
|
|
||
|
|
@@ -78,7 +80,8 @@ public enum Type { | |
| EXISTING_STORE, | ||
| PEER, | ||
| SNAPSHOT, | ||
| LOCAL_SHARDS | ||
| LOCAL_SHARDS, | ||
| SPLIT | ||
| } | ||
|
|
||
| public abstract Type getType(); | ||
|
|
@@ -319,4 +322,36 @@ public boolean expectEmptyRetentionLeases() { | |
| return false; | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * split recovery from a source primary shard | ||
| */ | ||
| public static class SplitRecoverySource extends RecoverySource { | ||
|
|
||
| public static final SplitRecoverySource INSTANCE = new SplitRecoverySource(); | ||
|
|
||
| private SplitRecoverySource() {} | ||
|
|
||
| @Override | ||
| public Type getType() { | ||
| return Type.SPLIT; | ||
| } | ||
|
|
||
| @Override | ||
| public String toString() { | ||
| return "split recovery"; | ||
| } | ||
|
|
||
| @Override | ||
| protected void writeAdditionalFields(StreamOutput out) throws IOException { | ||
| super.writeAdditionalFields(out); | ||
| } | ||
|
|
||
| @Override | ||
| public void addAdditionalFields(XContentBuilder builder, Params params) throws IOException { | ||
| super.addAdditionalFields(builder, params); | ||
| } | ||
|
|
||
| // TODO: Expect empty retention leases? | ||
| } | ||
| } | ||
Review comment: nit, do we have a custom of `invalid` functions that return `false` if the state is valid? It seems like a double negative and using `valid` would be more natural to me, unless that is against custom.
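For illustration only, here is one shape the check could take if flipped to the positive form the nit suggests. It is a sketch against the `invalidShardSplit` code in the diff above, not part of the PR, and keeps the same fields and conditions (assertions omitted for brevity):

```java
// Sketch (not part of the PR): invalidShardSplit expressed positively. Callers would
// then write `if (validShardSplit(startedShardEntry, projectId, initialState) == false) { ... }`.
private static boolean validShardSplit(StartedShardEntry entry, ProjectId projectId, ClusterState state) {
    ShardSplit shardSplit = entry.shardSplit;
    if (shardSplit == null) {
        return true; // not a split start, nothing to validate
    }
    IndexRoutingTable routingTable = state.routingTable(projectId).index(entry.shardId.getIndex());
    IndexMetadata indexMetadata = state.metadata().getProject(projectId).index(entry.shardId.getIndex());
    IndexReshardingState.Split split = indexMetadata.getReshardingMetadata().getSplit();
    int sourceShardId = entry.shardId.getId() % split.shardCountBefore();
    long currentSourcePrimaryTerm = indexMetadata.primaryTerm(sourceShardId);
    // Valid only if the source primary term is unchanged, the target's term is at least as large,
    // and the source primary is not relocating.
    return shardSplit.sourcePrimaryTerm() == currentSourcePrimaryTerm
        && entry.primaryTerm >= currentSourcePrimaryTerm
        && routingTable.shard(sourceShardId).primaryShard().relocating() == false;
}
```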