Trigger merges after recovery (#113102)

DaveCTurner · web-flow · commit 33a73a8111fb · 2024-09-20T17:16:03.000+01:00
We may have shut a shard down while merges were still pending (or
adjusted the merge policy while the shard was down) meaning that after
recovery its segments do not reflect the desired state according to the
merge policy. With this commit we invoke `IndexWriter#maybeMerge()` at
the end of recovery to check for, and execute, any such lost merges.
diff --git a/docs/changelog/113102.yaml b/docs/changelog/113102.yaml
@@ -0,0 +1,5 @@
+pr: 113102
+summary: Trigger merges after recovery
+area: Recovery
+type: enhancement
+issues: []
diff --git a/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderIT.java b/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderIT.java
@@ -255,9 +255,9 @@ private Set<ShardId> getShardIds(final String nodeId, final String indexName) {
     /**
      * Index documents until all the shards are at least WATERMARK_BYTES in size, and return the one with the smallest size
      */
-    private ShardSizes createReasonableSizedShards(final String indexName) throws InterruptedException {
+    private ShardSizes createReasonableSizedShards(final String indexName) {
         while (true) {
-            indexRandom(true, indexName, scaledRandomIntBetween(100, 10000));
+            indexRandom(false, indexName, scaledRandomIntBetween(100, 10000));
             forceMerge();
             refresh();
 
diff --git a/server/src/internalClusterTest/java/org/elasticsearch/indices/recovery/IndexRecoveryIT.java b/server/src/internalClusterTest/java/org/elasticsearch/indices/recovery/IndexRecoveryIT.java
@@ -34,6 +34,7 @@
 import org.elasticsearch.action.admin.cluster.snapshots.restore.RestoreSnapshotResponse;
 import org.elasticsearch.action.admin.indices.recovery.RecoveryRequest;
 import org.elasticsearch.action.admin.indices.recovery.RecoveryResponse;
+import org.elasticsearch.action.admin.indices.settings.put.UpdateSettingsRequest;
 import org.elasticsearch.action.admin.indices.stats.CommonStatsFlags;
 import org.elasticsearch.action.admin.indices.stats.IndicesStatsResponse;
 import org.elasticsearch.action.admin.indices.stats.ShardStats;
@@ -86,6 +87,7 @@
 import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.index.engine.Engine;
+import org.elasticsearch.index.engine.Segment;
 import org.elasticsearch.index.mapper.MapperParsingException;
 import org.elasticsearch.index.mapper.SeqNoFieldMapper;
 import org.elasticsearch.index.recovery.RecoveryStats;
@@ -137,16 +139,21 @@
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.function.Consumer;
+import java.util.function.LongSupplier;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
+import java.util.stream.Stream;
 
 import static java.util.Collections.singletonMap;
 import static java.util.stream.Collectors.toList;
 import static org.elasticsearch.action.DocWriteResponse.Result.CREATED;
 import static org.elasticsearch.action.DocWriteResponse.Result.UPDATED;
 import static org.elasticsearch.action.support.ActionTestUtils.assertNoFailureListener;
 import static org.elasticsearch.cluster.routing.allocation.decider.EnableAllocationDecider.CLUSTER_ROUTING_REBALANCE_ENABLE_SETTING;
+import static org.elasticsearch.index.MergePolicyConfig.INDEX_MERGE_ENABLED;
 import static org.elasticsearch.index.seqno.SequenceNumbers.NO_OPS_PERFORMED;
+import static org.elasticsearch.indices.IndexingMemoryController.SHARD_INACTIVE_TIME_SETTING;
+import static org.elasticsearch.node.NodeRoleSettings.NODE_ROLES_SETTING;
 import static org.elasticsearch.node.RecoverySettingsChunkSizePlugin.CHUNK_SIZE_SETTING;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;
@@ -158,6 +165,7 @@
 import static org.hamcrest.Matchers.hasSize;
 import static org.hamcrest.Matchers.instanceOf;
 import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.lessThan;
 import static org.hamcrest.Matchers.lessThanOrEqualTo;
 import static org.hamcrest.Matchers.not;
 import static org.hamcrest.Matchers.notNullValue;
@@ -1957,6 +1965,77 @@ public void accept(long globalCheckpoint, Exception e) {
         recoveryCompleteListener.onResponse(null);
     }
 
+    /**
+     * Checks that segments left unmerged before a restart are merged after recovery, but only on suitable nodes.
+     * <p>
+     * The test creates 20 one-document segments while merges are disabled for the index, then removes the merge setting
+     * override while the data node is stopped. A first restart with only warm/cold data roles must leave the searchable
+     * segment count and the merge stats untouched; a second restart with at least one of the {@code data},
+     * {@code data_content} or {@code data_hot} roles must eventually reduce the searchable segment count.
+     */
+    public void testPostRecoveryMerge() throws Exception {
+        internalCluster().startMasterOnlyNode();
+        final var dataNode = internalCluster().startDataOnlyNode();
+        final var indexName = randomIdentifier();
+        createIndex(indexName, indexSettings(1, 0).put(INDEX_MERGE_ENABLED, false).build());
+
+        final var initialSegmentCount = 20;
+        for (int i = 0; i < initialSegmentCount; i++) {
+            indexDoc(indexName, Integer.toString(i), "f", randomAlphaOfLength(10));
+            refresh(indexName); // force a one-doc segment
+        }
+        flush(indexName); // commit all the one-doc segments
+
+        // counts the searchable segments of the first shard copy reported for the index
+        final LongSupplier searchableSegmentCountSupplier = () -> indicesAdmin().prepareSegments(indexName)
+            .get(SAFE_AWAIT_TIMEOUT)
+            .getIndices()
+            .get(indexName)
+            .getShards()
+            .get(0)
+            .shards()[0].getSegments()
+            .stream()
+            .filter(Segment::isSearch)
+            .count();
+
+        assertEquals(initialSegmentCount, searchableSegmentCountSupplier.getAsLong());
+
+        // force a recovery by restarting the node, re-enabling merges while the node is down, but configure the node not to be in the hot
+        // or content tiers so that it does not do any post-recovery merge
+        internalCluster().restartNode(dataNode, new InternalTestCluster.RestartCallback() {
+            @Override
+            public Settings onNodeStopped(String nodeName) {
+                final var request = new UpdateSettingsRequest(Settings.builder().putNull(INDEX_MERGE_ENABLED).build(), indexName);
+                request.reopen(true);
+                safeGet(indicesAdmin().updateSettings(request));
+                return Settings.builder()
+                    .putList(NODE_ROLES_SETTING.getKey(), randomNonEmptySubsetOf(List.of("data_warm", "data_cold")))
+                    .build();
+            }
+        });
+
+        ensureGreen(indexName);
+        // on the warm/cold-only node no merge may be running or have run, and every original segment must still be there
+        final var mergeStats = indicesAdmin().prepareStats(indexName).clear().setMerge(true).get().getIndex(indexName).getShards()[0]
+            .getStats()
+            .getMerge();
+        assertEquals(0, mergeStats.getCurrent());
+        assertEquals(0, mergeStats.getTotal());
+        assertEquals(initialSegmentCount, searchableSegmentCountSupplier.getAsLong());
+
+        // force a recovery by restarting the node again, but this time putting it into the hot or content tiers to enable post-recovery
+        // merges
+        internalCluster().restartNode(dataNode, new InternalTestCluster.RestartCallback() {
+            @Override
+            public Settings onNodeStopped(String nodeName) {
+                return Settings.builder()
+                    .putList(
+                        NODE_ROLES_SETTING.getKey(),
+                        Stream.concat(
+                            Stream.of(randomFrom("data", "data_content", "data_hot")),
+                            Stream.of("data", "data_content", "data_hot", "data_warm", "data_cold").filter(p -> randomBoolean())
+                        ).distinct().toList()
+                    )
+                    // set the inactive time to zero so that we flush immediately after every merge, rather than having the test wait 5min
+                    .put(SHARD_INACTIVE_TIME_SETTING.getKey(), TimeValue.ZERO)
+                    .build();
+            }
+        });
+
+        ensureGreen(indexName);
+        assertBusy(() -> assertThat(searchableSegmentCountSupplier.getAsLong(), lessThan((long) initialSegmentCount)));
+    }
+
     private void assertGlobalCheckpointIsStableAndSyncedInAllNodes(String indexName, List<String> nodes, int shard) throws Exception {
         assertThat(nodes, is(not(empty())));
 
diff --git a/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java b/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java
@@ -1514,6 +1514,22 @@ public void forceMerge(ForceMergeRequest forceMerge) throws IOException {
         engine.forceMerge(forceMerge.flush(), forceMerge.maxNumSegments(), forceMerge.onlyExpungeDeletes(), forceMerge.forceMergeUUID());
     }
 
+    /**
+     * Asks the engine to look for merges that are pending on this shard, without imposing a segment count limit, without
+     * requesting expunge-delete merges and without an immediate flush. Per the inline comments below, the intent is only to
+     * reach {@code IndexWriter#maybeMerge}; any merge that results is not waited for.
+     * <p>
+     * This is a no-op unless the shard state is {@code STARTED} or {@code POST_RECOVERY}.
+     *
+     * @throws IOException declared because the engine's force-merge call may throw it
+     */
+    public void triggerPendingMerges() throws IOException {
+        switch (state /* single volatile read */) {
+            case STARTED, POST_RECOVERY -> getEngine().forceMerge(
+                // don't immediately flush - if any merging happens then we don't wait for it anyway
+                false,
+                // don't apply any segment count limit, we only want to call IndexWriter#maybeMerge
+                ForceMergeRequest.Defaults.MAX_NUM_SEGMENTS,
+                // don't look for expunge-delete merges, we only want to call IndexWriter#maybeMerge
+                false,
+                // force-merge UUID is not used when calling IndexWriter#maybeMerge
+                null
+            );
+            // otherwise shard likely closed and maybe reopened, nothing to do
+        }
+    }
+
     /**
      * Creates a new {@link IndexCommit} snapshot from the currently running engine. All resources referenced by this
      * commit won't be freed until the commit / snapshot is closed.
diff --git a/server/src/main/java/org/elasticsearch/indices/IndicesService.java b/server/src/main/java/org/elasticsearch/indices/IndicesService.java
@@ -262,6 +262,7 @@ public class IndicesService extends AbstractLifecycleComponent
     private final TimestampFieldMapperService timestampFieldMapperService;
     private final CheckedBiConsumer<ShardSearchRequest, StreamOutput, IOException> requestCacheKeyDifferentiator;
     private final MapperMetrics mapperMetrics;
+    private final PostRecoveryMerger postRecoveryMerger;
 
     @Override
     protected void doStart() {
@@ -378,6 +379,8 @@ public void onRemoval(ShardId shardId, String fieldName, boolean wasEvicted, lon
         clusterService.getClusterSettings().addSettingsUpdateConsumer(ALLOW_EXPENSIVE_QUERIES, this::setAllowExpensiveQueries);
 
         this.timestampFieldMapperService = new TimestampFieldMapperService(settings, threadPool, this);
+
+        this.postRecoveryMerger = new PostRecoveryMerger(settings, threadPool.executor(ThreadPool.Names.FORCE_MERGE), this::getShardOrNull);
     }
 
     private static final String DANGLING_INDICES_UPDATE_THREAD_NAME = "DanglingIndices#updateTask";
@@ -890,23 +893,29 @@ public void createShard(
         RecoveryState recoveryState = indexService.createRecoveryState(shardRouting, targetNode, sourceNode);
         IndexShard indexShard = indexService.createShard(shardRouting, globalCheckpointSyncer, retentionLeaseSyncer);
         indexShard.addShardFailureCallback(onShardFailure);
-        indexShard.startRecovery(recoveryState, recoveryTargetService, recoveryListener, repositoriesService, (mapping, listener) -> {
-            assert recoveryState.getRecoverySource().getType() == RecoverySource.Type.LOCAL_SHARDS
-                : "mapping update consumer only required by local shards recovery";
-            AcknowledgedRequest<PutMappingRequest> putMappingRequestAcknowledgedRequest = new PutMappingRequest().setConcreteIndex(
-                shardRouting.index()
-            )
-                .setConcreteIndex(shardRouting.index()) // concrete index - no name clash, it uses uuid
-                .source(mapping.source().string(), XContentType.JSON);
-            // concrete index - no name clash, it uses uuid
-            client.execute(
-                featureService.clusterHasFeature(clusterService.state(), SUPPORTS_AUTO_PUT)
-                    ? TransportAutoPutMappingAction.TYPE
-                    : TransportPutMappingAction.TYPE,
-                putMappingRequestAcknowledgedRequest.ackTimeout(TimeValue.MAX_VALUE).masterNodeTimeout(TimeValue.MAX_VALUE),
-                new RefCountAwareThreadedActionListener<>(threadPool.generic(), listener.map(ignored -> null))
-            );
-        }, this, clusterStateVersion);
+        indexShard.startRecovery(
+            recoveryState,
+            recoveryTargetService,
+            postRecoveryMerger.maybeMergeAfterRecovery(shardRouting, recoveryListener),
+            repositoriesService,
+            (mapping, listener) -> {
+                assert recoveryState.getRecoverySource().getType() == RecoverySource.Type.LOCAL_SHARDS
+                    : "mapping update consumer only required by local shards recovery";
+                AcknowledgedRequest<PutMappingRequest> putMappingRequestAcknowledgedRequest = new PutMappingRequest()
+                    // concrete index - no name clash, it uses uuid
+                    .setConcreteIndex(shardRouting.index())
+                    .source(mapping.source().string(), XContentType.JSON);
+                client.execute(
+                    featureService.clusterHasFeature(clusterService.state(), SUPPORTS_AUTO_PUT)
+                        ? TransportAutoPutMappingAction.TYPE
+                        : TransportPutMappingAction.TYPE,
+                    putMappingRequestAcknowledgedRequest.ackTimeout(TimeValue.MAX_VALUE).masterNodeTimeout(TimeValue.MAX_VALUE),
+                    new RefCountAwareThreadedActionListener<>(threadPool.generic(), listener.map(ignored -> null))
+                );
+            },
+            this,
+            clusterStateVersion
+        );
     }
 
     @Override
diff --git a/server/src/main/java/org/elasticsearch/indices/PostRecoveryMerger.java b/server/src/main/java/org/elasticsearch/indices/PostRecoveryMerger.java
@@ -0,0 +1,145 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.indices;
+
+import org.apache.lucene.index.IndexWriter;
+import org.elasticsearch.action.ActionListener;
+import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.cluster.routing.ShardRouting;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.util.concurrent.ThrottledTaskRunner;
+import org.elasticsearch.core.Releasable;
+import org.elasticsearch.core.Strings;
+import org.elasticsearch.index.shard.IndexShard;
+import org.elasticsearch.index.shard.ShardId;
+import org.elasticsearch.index.shard.ShardLongFieldRange;
+import org.elasticsearch.indices.recovery.PeerRecoveryTargetService;
+import org.elasticsearch.indices.recovery.RecoveryFailedException;
+import org.elasticsearch.indices.recovery.RecoveryState;
+import org.elasticsearch.logging.LogManager;
+import org.elasticsearch.logging.Logger;
+
+import java.util.concurrent.Executor;
+import java.util.function.Function;
+
+import static org.elasticsearch.cluster.node.DiscoveryNodeRole.DATA_CONTENT_NODE_ROLE;
+import static org.elasticsearch.cluster.node.DiscoveryNodeRole.DATA_HOT_NODE_ROLE;
+import static org.elasticsearch.cluster.node.DiscoveryNodeRole.DATA_ROLE;
+import static org.elasticsearch.cluster.node.DiscoveryNodeRole.INDEX_ROLE;
+
+/**
+ * Triggers a check for pending merges when a shard completes recovery.
+ * <p>
+ * The check is only scheduled when all of the following hold: the system property {@code es.trigger_merge_after_recovery}
+ * is not set to {@code false}; the node settings include at least one of {@code DATA_HOT_NODE_ROLE},
+ * {@code DATA_CONTENT_NODE_ROLE}, {@code DATA_ROLE} or {@code INDEX_ROLE}; and the recovering shard copy is promotable to
+ * primary. The check itself is best-effort: failures are only logged at debug level.
+ */
+class PostRecoveryMerger {
+
+    private static final Logger logger = LogManager.getLogger(PostRecoveryMerger.class);
+
+    /**
+     * Global switch, read once during class initialization from the system property {@code es.trigger_merge_after_recovery}:
+     * {@code true} when the property is absent, {@code false} when it is exactly {@code "false"}; any other value is rejected
+     * with an {@link IllegalStateException}.
+     */
+    private static final boolean TRIGGER_MERGE_AFTER_RECOVERY;
+
+    static {
+        final var propertyValue = System.getProperty("es.trigger_merge_after_recovery");
+        if (propertyValue == null) {
+            TRIGGER_MERGE_AFTER_RECOVERY = true;
+        } else if ("false".equals(propertyValue)) {
+            TRIGGER_MERGE_AFTER_RECOVERY = false;
+        } else {
+            throw new IllegalStateException(
+                "system property [es.trigger_merge_after_recovery] may only be set to [false], but was [" + propertyValue + "]"
+            );
+        }
+    }
+
+    /**
+     * Throttled runner to avoid multiple concurrent calls to {@link IndexWriter#maybeMerge()}: we do not need to execute these things
+     * especially quickly, as long as they happen eventually, and each such call may involve some IO (reading the soft-deletes doc values to
+     * count deleted docs). Note that we're not throttling any actual merges, just the checks to see what merges might be needed. Throttling
+     * merges across shards is a separate issue, but normally this mechanism won't trigger any new merges anyway.
+     */
+    private final ThrottledTaskRunner postRecoveryMergeRunner;
+
+    /** Looks up the local {@link IndexShard} for a shard ID; a {@code null} result is treated as "nothing to do". */
+    private final Function<ShardId, IndexShard> shardFunction;
+
+    /** Whether this node schedules post-recovery merge checks at all; computed once in the constructor. */
+    private final boolean enabled;
+
+    /**
+     * @param settings      node settings, used to determine which roles this node has
+     * @param executor      executor handed to the throttled runner that performs the merge checks
+     * @param shardFunction resolves a shard ID to the local shard instance, or {@code null} if there is none
+     */
+    PostRecoveryMerger(Settings settings, Executor executor, Function<ShardId, IndexShard> shardFunction) {
+        this.postRecoveryMergeRunner = new ThrottledTaskRunner(getClass().getCanonicalName(), 1, executor);
+        this.shardFunction = shardFunction;
+        this.enabled =
+            // enabled globally ...
+            TRIGGER_MERGE_AFTER_RECOVERY
+                // ... and we are a node that expects nontrivial amounts of indexing work
+                && (DiscoveryNode.hasRole(settings, DATA_HOT_NODE_ROLE)
+                    || DiscoveryNode.hasRole(settings, DATA_CONTENT_NODE_ROLE)
+                    || DiscoveryNode.hasRole(settings, DATA_ROLE)
+                    || DiscoveryNode.hasRole(settings, INDEX_ROLE));
+    }
+
+    /**
+     * Decorates a recovery listener so that a successful recovery also enqueues a pending-merge check for the shard.
+     *
+     * @param shardRouting     routing entry of the shard copy being recovered
+     * @param recoveryListener the listener to notify about the outcome of the recovery
+     * @return {@code recoveryListener} itself if this mechanism is disabled on this node or the shard copy is not promotable
+     *         to primary; otherwise a wrapper that enqueues the merge check and then delegates to {@code recoveryListener}
+     */
+    PeerRecoveryTargetService.RecoveryListener maybeMergeAfterRecovery(
+        ShardRouting shardRouting,
+        PeerRecoveryTargetService.RecoveryListener recoveryListener
+    ) {
+        if (enabled == false) {
+            return recoveryListener;
+        }
+
+        if (shardRouting.isPromotableToPrimary() == false) {
+            return recoveryListener;
+        }
+
+        final var shardId = shardRouting.shardId();
+        return new PeerRecoveryTargetService.RecoveryListener() {
+            @Override
+            public void onRecoveryDone(
+                RecoveryState state,
+                ShardLongFieldRange timestampMillisFieldRange,
+                ShardLongFieldRange eventIngestedMillisFieldRange
+            ) {
+                // queue the merge check first, then pass the completion on to the wrapped listener unchanged
+                postRecoveryMergeRunner.enqueueTask(new PostRecoveryMerge(shardId));
+                recoveryListener.onRecoveryDone(state, timestampMillisFieldRange, eventIngestedMillisFieldRange);
+            }
+
+            @Override
+            public void onRecoveryFailure(RecoveryFailedException e, boolean sendShardFailure) {
+                // a failed recovery triggers no merge check, just delegate
+                recoveryListener.onRecoveryFailure(e, sendShardFailure);
+            }
+        };
+    }
+
+    /**
+     * Task enqueued on the throttled runner for a single shard. When it receives its {@link Releasable} it looks the shard up
+     * again via {@code shardFunction} and calls {@link IndexShard#triggerPendingMerges()}; the {@link Releasable} is closed when
+     * that call returns or throws. Any exception is logged at debug level and otherwise ignored.
+     */
+    class PostRecoveryMerge implements ActionListener<Releasable> {
+        private final ShardId shardId;
+
+        PostRecoveryMerge(ShardId shardId) {
+            this.shardId = shardId;
+        }
+
+        @Override
+        public void onResponse(Releasable releasable) {
+            try (releasable) {
+                final var indexShard = shardFunction.apply(shardId);
+                if (indexShard == null) {
+                    // no such shard on this node at the time the task runs, so there is nothing to check
+                    return;
+                }
+
+                indexShard.triggerPendingMerges();
+            } catch (Exception e) {
+                logFailure(e);
+            }
+        }
+
+        @Override
+        public void onFailure(Exception e) {
+            logFailure(e);
+        }
+
+        private void logFailure(Exception e) {
+            // post-recovery merge is a best-effort thing, failure needs no special handling
+            logger.debug(() -> Strings.format("failed to execute post-recovery merge of [%s]", shardId), e);
+        }
+    }
+}