/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the "Elastic License
 * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
 * Public License v 1"; you may not use this file except in compliance with, at
 * your election, the "Elastic License 2.0", the "GNU Affero General Public
 * License v3.0 only", or the "Server Side Public License, v 1".
 */

package org.elasticsearch.cluster.metadata;

import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.xcontent.ConstructingObjectParser;
import org.elasticsearch.xcontent.ParseField;
import org.elasticsearch.xcontent.ToXContentFragment;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentParser;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
/**
 * IndexReshardingMetadata holds persistent state managing an in-flight index resharding operation.
 *
 * Resharding is changing the number of shards that make up an index, in place.
 * We currently only support splitting an index into an integer multiple of its current shard count,
 * e.g., going from 1 to 3 shards, or 2 to 4. This is because we route documents to shards by hash of
 * the document id modulo the shard count. Multiplying the shard count under this scheme lets us move
 * only the fraction of the documents that route to new shards while the rest stay where they were.
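 *
 * For example (an illustrative sketch of the arithmetic only; {@code hash} stands in for the routing hash
 * of a document id):
 * <pre>{@code
 * int oldCount = 2, newCount = 6;
 * int hash = 9;
 * int oldShard = hash % oldCount; // 1: where the document lived before the split
 * int newShard = hash % newCount; // 3: where it routes after the split
 * // Because newCount is a multiple of oldCount, newShard % oldCount == oldShard always holds, so a
 * // document either stays on its old shard or moves to a target shard sourced from its old shard.
 * }</pre>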
 *
 * During a split, we create new shards and then migrate the documents that belong to the new shards
 * according to the routing function to those new shards. While we're moving documents, search requests
 * may be ongoing, or new documents may be indexed. There must not be ambiguity about whether the source
 * shard or the target shards are responsible for documents being indexed or searched while this handoff
 * is occurring, to ensure that we don't lose or double-count documents during the process. We prevent this
 * by maintaining the state of the split on the source and target shards, and making an atomic (from the point
 * of view of indexing and search requests) transition from handling requests that route to the target shard
 * on the source shard, to letting the target shard handle them.
 *
 * Before the handoff, the source shard has the entire document collection for both the source and target, and handles
 * indexing and search requests. After the handoff, documents that route to the target are handled by the target,
 * and the source does not necessarily have a complete view - it will be missing any documents that are indexed
 * to the target shard after handoff. Indeed, when the target becomes active, the source filters target documents
 * from its search results, so that they are not counted twice when the target shard is also searched. The handoff
 * is performed at the source by queueing incoming requests prior to entering handoff, waiting for the target to
 * be RUNNING, and then forwarding requests for the target shard to the target. Similarly, when the target first
 * becomes active it must filter out search results containing documents owned by the source shard, which may be
 * present if the target was created by copying the source shard's Lucene files.
 *
 * To ensure that we always route requests to the correct shard, even in the case of failure of either source or
 * target shards during split, we preserve the transition point in persistent state until the split is complete, so
 * that when the source or target recovers, it can resync and route correctly based on that state. This class holds
 * the persistent state required to recover correctly, always maintaining the invariant that only the source shard
 * accepts indexing and search requests for the target prior to handoff, and only the target shard accepts them afterward.
 *
 * The state we preserve is:
 * * The old and new shard counts for a resharding operation, so that we can always identify which shards are sources
 *   and which are targets during resharding. For example, old:2 new:6 implies that shard 0 is the source shard for
 *   shards 2 and 4, and shard 1 is the source for shards 3 and 5 (see the sketch following this list).
 * * For each source shard, its current source state, either `SOURCE` or `DONE`.
 *   - If a source shard may still contain data for any target shard then it is in state `SOURCE`.
 *   - When all targets for a source have moved to `SPLIT` (see below), then the source deletes all documents from
 *     its store that are now the responsibility of the target shards and transitions to `DONE`.
 *   This isn't strictly required to be persistent for correctness, but it can save time on recovery
 *   by allowing a DONE shard to skip interrogating targets and repeating cleanup.
 * * For each target shard, its current target state, one of `CLONE`, `HANDOFF`, `SPLIT`, or `DONE`.
 *   - If the target has not yet copied all data from the source shard, then it is in `CLONE`.
 *   - It moves to `HANDOFF` when it has copied all of its data from the source to indicate that it is now ready to
 *     receive indexing actions, and starts RUNNING. After this point, the source may no longer contain the entire contents
 *     of the target and must not index documents belonging to the target. But since search shards can't start up until
 *     their corresponding index nodes are RUNNING, search requests would fail if they routed to the target shard immediately
 *     after handoff. So at HANDOFF, the source shards continue to service searches, but block refresh since they cannot
 *     be guaranteed to have seen documents indexed after HANDOFF.
 *   - When the target shard's corresponding search replica has started running, the target requests that the source filter
 *     search results belonging to the target, and the target shard's state moves to `SPLIT`. The target's search replica
 *     likewise filters documents not belonging to the target, which may be present due to the target bootstrapping by copying
 *     the source's Lucene files.
 *   - Upon entering `SPLIT`, the target starts deleting all documents from its Lucene store that do not belong to it. When that
 *     is complete, it moves to `DONE` and removes filters for other shards, which are no longer necessary.
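 *
 * As a sketch of the source-for-target mapping referenced in the first list item (hypothetical helper,
 * not part of this class):
 * <pre>{@code
 * // the source shard responsible for a given target shard
 * static int sourceShardOf(int targetShard, int oldShardCount) {
 *     return targetShard % oldShardCount; // old:2 new:6 -> shard 0 sources {2, 4}, shard 1 sources {3, 5}
 * }
 * }</pre>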
 *
 * Note that each target shard's split operates independently and all may happen concurrently.
 *
 * When all source shards have transitioned to `DONE`, the resharding operation is complete and this metadata may be
 * removed from cluster state. We only allow at most a single resharding operation to be in flight for an index, so
 * removing this metadata is a prerequisite to beginning another resharding operation.
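 *
 * A minimal usage sketch (illustrative only; it assumes target index 0 corresponds to the first new shard,
 * which follows from the target state array having one entry per new shard beyond the old shard count):
 * <pre>{@code
 * IndexReshardingMetadata metadata = new IndexReshardingMetadata(2, 6); // begin splitting 2 shards into 6
 * metadata.setTargetShardState(0, TargetShardState.HANDOFF);            // first target finished copying
 * assert metadata.getTargetShardState(0) == TargetShardState.HANDOFF;
 * }</pre>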
 */
public record IndexReshardingMetadata(
    int oldShardCount,
    int newShardCount,
    SourceShardState[] sourceShardStates,
    TargetShardState[] targetShardStates
) implements ToXContentFragment, Writeable {
    public enum SourceShardState implements Writeable {
        SOURCE,
        DONE;

        @Override
        public void writeTo(StreamOutput out) throws IOException {
            out.writeEnum(this);
        }
    }

    public enum TargetShardState implements Writeable {
        CLONE,
        HANDOFF,
        SPLIT,
        DONE;

        @Override
        public void writeTo(StreamOutput out) throws IOException {
            out.writeEnum(this);
        }
    }

    // ParseField definitions follow the same pattern as IndexMetadataStats
    public static final ParseField OLD_SHARD_COUNT_FIELD = new ParseField("old_shard_count");
    public static final ParseField NEW_SHARD_COUNT_FIELD = new ParseField("new_shard_count");
    public static final ParseField SOURCE_SHARD_STATES_FIELD = new ParseField("source_shard_states");
    public static final ParseField TARGET_SHARD_STATES_FIELD = new ParseField("target_shard_states");

    @SuppressWarnings("unchecked")
    private static final ConstructingObjectParser<IndexReshardingMetadata, Void> PARSER = new ConstructingObjectParser<>(
        "index_resharding_metadata_parser",
        false,
        (args, unused) -> new IndexReshardingMetadata(
            (int) args[0],
            (int) args[1],
            ((List<SourceShardState>) args[2]).toArray(new SourceShardState[0]),
            ((List<TargetShardState>) args[3]).toArray(new TargetShardState[0])
        )
    );

    static {
        PARSER.declareInt(ConstructingObjectParser.constructorArg(), OLD_SHARD_COUNT_FIELD);
        PARSER.declareInt(ConstructingObjectParser.constructorArg(), NEW_SHARD_COUNT_FIELD);
        // XXX I'm not sure this is the best way to parse an array of enums
        PARSER.declareObjectArray(
            ConstructingObjectParser.constructorArg(),
            (parser, c) -> SourceShardState.valueOf(parser.text()),
            SOURCE_SHARD_STATES_FIELD
        );
        PARSER.declareObjectArray(
            ConstructingObjectParser.constructorArg(),
            (parser, c) -> TargetShardState.valueOf(parser.text()),
            TARGET_SHARD_STATES_FIELD
        );
    }

    static IndexReshardingMetadata fromXContent(XContentParser parser) throws IOException {
        return PARSER.parse(parser, null);
    }

    @Override
    public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
        builder.field(OLD_SHARD_COUNT_FIELD.getPreferredName(), oldShardCount);
        builder.field(NEW_SHARD_COUNT_FIELD.getPreferredName(), newShardCount);
        builder.field(SOURCE_SHARD_STATES_FIELD.getPreferredName(), sourceShardStates);
        builder.field(TARGET_SHARD_STATES_FIELD.getPreferredName(), targetShardStates);
        return builder;
    }

    @Override
    public void writeTo(StreamOutput out) throws IOException {
        out.writeInt(oldShardCount);
        out.writeInt(newShardCount);
        out.writeArray(sourceShardStates);
        out.writeArray(targetShardStates);
    }

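    /**
     * Reads the metadata from the wire, mirroring {@code writeTo}. A test-style round-trip sketch
     * (illustrative only, assuming {@code org.elasticsearch.common.io.stream.BytesStreamOutput} is available):
     * <pre>{@code
     * IndexReshardingMetadata metadata = new IndexReshardingMetadata(2, 6);
     * BytesStreamOutput out = new BytesStreamOutput();
     * metadata.writeTo(out);
     * IndexReshardingMetadata copy = new IndexReshardingMetadata(out.bytes().streamInput());
     * assert copy.equals(metadata);
     * }</pre>
     */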
    public IndexReshardingMetadata(StreamInput in) throws IOException {
        this(
            in.readInt(),
            in.readInt(),
            in.readArray(i -> i.readEnum(SourceShardState.class), SourceShardState[]::new),
            in.readArray(i -> i.readEnum(TargetShardState.class), TargetShardState[]::new)
        );
    }

    public IndexReshardingMetadata(int oldShardCount, int newShardCount) {
        this(
            oldShardCount,
            newShardCount,
            initialSourceShardStates(oldShardCount),
            initialTargetShardStates(newShardCount - oldShardCount)
        );
    }

    public IndexReshardingMetadata(
        int oldShardCount,
        int newShardCount,
        SourceShardState[] sourceShardStates,
        TargetShardState[] targetShardStates
    ) {
        assert newShardCount > oldShardCount : "Reshard currently only supports increasing the number of shards";
        assert newShardCount % oldShardCount == 0 : "New shard count must be a multiple of old shard count";
        assert sourceShardStates.length == oldShardCount : "Must be one source shard state for each old shard";
        assert targetShardStates.length == newShardCount - oldShardCount : "Must be one target shard state for each new shard";

        this.oldShardCount = oldShardCount;
        this.newShardCount = newShardCount;
        this.sourceShardStates = sourceShardStates;
        this.targetShardStates = targetShardStates;
    }

    // can't use the record-generated implementation because we need a deep comparison of the shard state arrays
    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (other == null || getClass() != other.getClass()) {
            return false;
        }
        IndexReshardingMetadata otherMetadata = (IndexReshardingMetadata) other;
        return oldShardCount == otherMetadata.oldShardCount
            && newShardCount == otherMetadata.newShardCount
            && Arrays.equals(sourceShardStates, otherMetadata.sourceShardStates)
            && Arrays.equals(targetShardStates, otherMetadata.targetShardStates);
    }

    @Override
    public int hashCode() {
        return Objects.hash(oldShardCount, newShardCount, Arrays.hashCode(sourceShardStates), Arrays.hashCode(targetShardStates));
    }

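    // Per the class comment, target states only advance forward (CLONE -> HANDOFF -> SPLIT -> DONE);
    // this setter does not enforce that ordering. A caller-side check might look like (hypothetical,
    // not enforced by this class):
    //   assert shardState.ordinal() >= getTargetShardState(shard).ordinal();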
    public void setTargetShardState(int shard, TargetShardState shardState) {
        targetShardStates[shard] = shardState;
    }

    public TargetShardState getTargetShardState(int shard) {
        return targetShardStates[shard];
    }

    private static SourceShardState[] initialSourceShardStates(int sourceShardCount) {
        SourceShardState[] sourceShardStates = new SourceShardState[sourceShardCount];
        Arrays.fill(sourceShardStates, SourceShardState.SOURCE);
        return sourceShardStates;
    }

    private static TargetShardState[] initialTargetShardStates(int targetShardCount) {
        TargetShardState[] targetShardStates = new TargetShardState[targetShardCount];
        Arrays.fill(targetShardStates, TargetShardState.CLONE);
        return targetShardStates;
    }
}