From bc87adc14a1c379c03e4e19a9b3d02c13c1a1181 Mon Sep 17 00:00:00 2001 From: Albert Zaharovits Date: Tue, 18 Mar 2025 19:32:49 +0200 Subject: [PATCH 01/14] Threadpool merge scheduler (#120869) This adds a new merge scheduler implementation that uses a (new) dedicated thread pool to run the merges. This way the number of concurrent merges is limited to the number of threads in the pool (i.e. the number of allocated processors to the ES JVM). It implements dynamic IO throttling (the same target IO rate for all merges, roughly, with caveats) that's adjusted based on the number of currently active (queued + running) merges. Smaller merges are always preferred to larger ones, irrespective of the index shard that they're coming from. The implementation also supports the per-shard "max thread count" and "max merge count" settings, the latter being used today for indexing throttling. Note that IO throttling, max merge count, and max thread count work similarly, but not identically, to their siblings in the ConcurrentMergeScheduler. The per-shard merge statistics are not affected, and the thread-pool statistics should reflect the merge ones (i.e. the completed thread pool stats reflects the total number of merges, across shards, per node). 
--- docs/changelog/120869.yaml | 5 + .../index/engine/InternalEngineMergeIT.java | 74 +- .../ThreadPoolMergeSchedulerStressTestIT.java | 313 ++++++++ .../index/shard/IndexShardIT.java | 1 + .../indices/IndexingMemoryControllerIT.java | 1 + .../common/settings/ClusterSettings.java | 2 + .../org/elasticsearch/index/IndexModule.java | 3 + .../org/elasticsearch/index/IndexService.java | 10 + .../index/engine/EngineConfig.java | 8 + .../index/engine/InternalEngine.java | 122 +++- .../ThreadPoolMergeExecutorService.java | 304 ++++++++ .../engine/ThreadPoolMergeScheduler.java | 529 ++++++++++++++ .../elasticsearch/index/shard/IndexShard.java | 6 + .../elasticsearch/indices/IndicesService.java | 13 + .../DefaultBuiltInExecutorBuilders.java | 7 + .../elasticsearch/threadpool/ThreadPool.java | 2 + .../elasticsearch/index/IndexModuleTests.java | 6 + .../index/engine/InternalEngineTests.java | 14 +- .../ThreadPoolMergeExecutorServiceTests.java | 690 ++++++++++++++++++ .../engine/ThreadPoolMergeSchedulerTests.java | 496 +++++++++++++ .../shard/IndexShardRetentionLeaseTests.java | 4 +- .../index/shard/IndexShardTests.java | 1 + .../index/shard/RefreshListenersTests.java | 10 +- .../index/engine/EngineTestCase.java | 12 + .../index/shard/IndexShardTestCase.java | 14 +- .../elasticsearch/test/ESIntegTestCase.java | 29 + .../test/InternalTestCluster.java | 15 + .../index/engine/FollowingEngineTests.java | 43 +- .../RetrySearchIntegTests.java | 6 +- .../SnapshotBasedIndexRecoveryIT.java | 2 +- 30 files changed, 2682 insertions(+), 60 deletions(-) create mode 100644 docs/changelog/120869.yaml create mode 100644 server/src/internalClusterTest/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerStressTestIT.java create mode 100644 server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorService.java create mode 100644 server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeScheduler.java create mode 100644 
server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorServiceTests.java create mode 100644 server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java diff --git a/docs/changelog/120869.yaml b/docs/changelog/120869.yaml new file mode 100644 index 0000000000000..024d51db894fd --- /dev/null +++ b/docs/changelog/120869.yaml @@ -0,0 +1,5 @@ +pr: 120869 +summary: Threadpool merge scheduler +area: Engine +type: feature +issues: [] diff --git a/server/src/internalClusterTest/java/org/elasticsearch/index/engine/InternalEngineMergeIT.java b/server/src/internalClusterTest/java/org/elasticsearch/index/engine/InternalEngineMergeIT.java index 80de2ffcaa7ac..243e4219ffef1 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/index/engine/InternalEngineMergeIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/index/engine/InternalEngineMergeIT.java @@ -8,24 +8,40 @@ */ package org.elasticsearch.index.engine; +import org.elasticsearch.action.admin.cluster.node.stats.NodeStats; +import org.elasticsearch.action.admin.cluster.node.stats.NodesStatsResponse; import org.elasticsearch.action.admin.indices.stats.IndicesStatsResponse; import org.elasticsearch.action.bulk.BulkRequestBuilder; import org.elasticsearch.action.bulk.BulkResponse; import org.elasticsearch.action.index.IndexRequest; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.test.ESIntegTestCase; import org.elasticsearch.test.ESIntegTestCase.ClusterScope; import org.elasticsearch.test.ESIntegTestCase.Scope; +import org.elasticsearch.threadpool.ThreadPool; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoFailures; import static 
org.elasticsearch.xcontent.XContentFactory.jsonBuilder; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.lessThan; import static org.hamcrest.Matchers.lessThanOrEqualTo; -@ClusterScope(supportsDedicatedMasters = false, numDataNodes = 1, scope = Scope.SUITE) +@ClusterScope(supportsDedicatedMasters = false, numDataNodes = 1, numClientNodes = 0, scope = Scope.TEST) public class InternalEngineMergeIT extends ESIntegTestCase { + private boolean useThreadPoolMerging; + + @Override + protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) { + useThreadPoolMerging = randomBoolean(); + Settings.Builder settings = Settings.builder().put(super.nodeSettings(nodeOrdinal, otherSettings)); + settings.put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), useThreadPoolMerging); + return settings.build(); + } + public void testMergesHappening() throws Exception { final int numOfShards = randomIntBetween(1, 5); // some settings to keep num segments low @@ -83,4 +99,60 @@ public void testMergesHappening() throws Exception { assertThat(count, lessThanOrEqualTo(upperNumberSegments)); } + public void testMergesUseTheMergeThreadPool() throws Exception { + final String indexName = randomIdentifier(); + createIndex(indexName, indexSettings(randomIntBetween(1, 3), 0).build()); + long id = 0; + final int minMerges = randomIntBetween(1, 5); + long totalDocs = 0; + + while (true) { + int docs = randomIntBetween(100, 200); + totalDocs += docs; + + BulkRequestBuilder request = client().prepareBulk(); + for (int j = 0; j < docs; ++j) { + request.add( + new IndexRequest(indexName).id(Long.toString(id++)) + .source(jsonBuilder().startObject().field("l", randomLong()).endObject()) + ); + } + BulkResponse response = request.get(); + assertNoFailures(response); + refresh(indexName); + + var mergesResponse = client().admin().indices().prepareStats(indexName).clear().setMerge(true).get(); + var primaries = 
mergesResponse.getIndices().get(indexName).getPrimaries(); + if (primaries.merge.getTotal() >= minMerges) { + break; + } + } + + forceMerge(); + refresh(indexName); + + // after a force merge there should only be 1 segment per shard + var shardsWithMultipleSegments = getShardSegments().stream() + .filter(shardSegments -> shardSegments.getSegments().size() > 1) + .toList(); + assertTrue("there are shards with multiple segments " + shardsWithMultipleSegments, shardsWithMultipleSegments.isEmpty()); + + final long expectedTotalDocs = totalDocs; + assertHitCount(prepareSearch(indexName).setQuery(QueryBuilders.matchAllQuery()).setTrackTotalHits(true), expectedTotalDocs); + + IndicesStatsResponse indicesStats = client().admin().indices().prepareStats(indexName).setMerge(true).get(); + long mergeCount = indicesStats.getIndices().get(indexName).getPrimaries().merge.getTotal(); + NodesStatsResponse nodesStatsResponse = client().admin().cluster().prepareNodesStats().setThreadPool(true).get(); + assertThat(nodesStatsResponse.getNodes().size(), equalTo(1)); + + NodeStats nodeStats = nodesStatsResponse.getNodes().get(0); + if (useThreadPoolMerging) { + assertThat( + nodeStats.getThreadPool().stats().stream().filter(s -> ThreadPool.Names.MERGE.equals(s.name())).findAny().get().completed(), + equalTo(mergeCount) + ); + } else { + assertTrue(nodeStats.getThreadPool().stats().stream().filter(s -> ThreadPool.Names.MERGE.equals(s.name())).findAny().isEmpty()); + } + } } diff --git a/server/src/internalClusterTest/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerStressTestIT.java b/server/src/internalClusterTest/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerStressTestIT.java new file mode 100644 index 0000000000000..1743ca1996055 --- /dev/null +++ b/server/src/internalClusterTest/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerStressTestIT.java @@ -0,0 +1,313 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. 
under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.engine; + +import org.apache.lucene.index.MergePolicy.OneMerge; +import org.apache.lucene.index.MergeScheduler; +import org.apache.lucene.index.MergeTrigger; +import org.apache.lucene.store.Directory; +import org.elasticsearch.action.admin.indices.segments.IndexShardSegments; +import org.elasticsearch.action.admin.indices.segments.IndicesSegmentResponse; +import org.elasticsearch.action.admin.indices.segments.ShardSegments; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.util.CollectionUtils; +import org.elasticsearch.common.util.concurrent.ConcurrentCollections; +import org.elasticsearch.common.util.concurrent.EsExecutors; +import org.elasticsearch.core.Nullable; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.MergePolicyConfig; +import org.elasticsearch.index.MergeSchedulerConfig; +import org.elasticsearch.index.merge.MergeStats; +import org.elasticsearch.index.merge.OnGoingMerge; +import org.elasticsearch.index.shard.ShardId; +import org.elasticsearch.plugins.EnginePlugin; +import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.plugins.PluginsService; +import org.elasticsearch.test.ESSingleNodeTestCase; + +import java.io.IOException; +import java.util.Collection; +import java.util.Iterator; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.Executor; +import java.util.concurrent.Semaphore; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import 
java.util.concurrent.atomic.AtomicReference; + +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAllSuccessful; +import static org.hamcrest.Matchers.greaterThanOrEqualTo; +import static org.hamcrest.Matchers.instanceOf; +import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.lessThanOrEqualTo; + +public class ThreadPoolMergeSchedulerStressTestIT extends ESSingleNodeTestCase { + + private static final int MERGE_SCHEDULER_MAX_CONCURRENCY = 3; + + @Override + protected Settings nodeSettings() { + return Settings.builder() + .put(super.nodeSettings()) + .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), true) + // when there are more threads than scheduler(s)' concurrency capacity, excess merges will be backlogged + // alternatively, when scheduler(s)' concurrency capacity exceeds the executor's thread count, excess merges will be enqueued + .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), MERGE_SCHEDULER_MAX_CONCURRENCY + randomFrom(-2, -1, 0, 1, 2)) + .build(); + } + + @Override + protected Collection> getPlugins() { + return CollectionUtils.appendToCopy(super.getPlugins(), ThreadPoolMergeSchedulerStressTestIT.TestEnginePlugin.class); + } + + public static class TestEnginePlugin extends Plugin implements EnginePlugin { + + final AtomicReference mergeExecutorServiceReference = new AtomicReference<>(); + final Set enqueuedMergesSet = ConcurrentCollections.newConcurrentSet(); + final Set runningMergesSet = ConcurrentCollections.newConcurrentSet(); + // maybe let a few merges run at the start + final int initialRunMergesCount = randomIntBetween(0, 5); + final Semaphore runMergeSemaphore = new Semaphore(initialRunMergesCount); + final int waitMergesEnqueuedCount = randomIntBetween(50, 100); + + void allowAllMerging() { + // even when indexing is done, queued and backlogged merges can themselves trigger further merging + // don't let this test be bothered by that, and simply let all merging run 
unhindered + runMergeSemaphore.release(Integer.MAX_VALUE - initialRunMergesCount); + } + + class TestInternalEngine extends org.elasticsearch.index.engine.InternalEngine { + + TestInternalEngine(EngineConfig engineConfig) { + super(engineConfig); + } + + protected ElasticsearchMergeScheduler createMergeScheduler( + ShardId shardId, + IndexSettings indexSettings, + @Nullable ThreadPoolMergeExecutorService threadPoolMergeExecutorService + ) { + ElasticsearchMergeScheduler mergeScheduler = super.createMergeScheduler( + shardId, + indexSettings, + threadPoolMergeExecutorService + ); + assertThat(mergeScheduler, instanceOf(ThreadPoolMergeScheduler.class)); + // assert there is a single merge executor service for all shards + mergeExecutorServiceReference.compareAndSet(null, threadPoolMergeExecutorService); + assertThat(mergeExecutorServiceReference.get(), is(threadPoolMergeExecutorService)); + return new TestMergeScheduler((ThreadPoolMergeScheduler) mergeScheduler); + } + + class TestMergeScheduler implements ElasticsearchMergeScheduler { + + ThreadPoolMergeScheduler delegateMergeScheduler; + + TestMergeScheduler(ThreadPoolMergeScheduler threadPoolMergeScheduler) { + this.delegateMergeScheduler = threadPoolMergeScheduler; + } + + @Override + public Set onGoingMerges() { + return delegateMergeScheduler.onGoingMerges(); + } + + @Override + public MergeStats stats() { + return delegateMergeScheduler.stats(); + } + + @Override + public void refreshConfig() { + delegateMergeScheduler.refreshConfig(); + } + + @Override + public MergeScheduler getMergeScheduler() { + return new MergeScheduler() { + @Override + public void merge(MergeSource mergeSource, MergeTrigger trigger) { + delegateMergeScheduler.merge(new MergeSource() { + @Override + public OneMerge getNextMerge() { + OneMerge nextMerge = mergeSource.getNextMerge(); + if (nextMerge != null) { + assertTrue(TestEnginePlugin.this.enqueuedMergesSet.add(nextMerge)); + // avoid excess merges pilling up + if 
(TestEnginePlugin.this.enqueuedMergesSet + .size() > TestEnginePlugin.this.waitMergesEnqueuedCount) { + runMergeSemaphore.release(); + } + } + return nextMerge; + } + + @Override + public void onMergeFinished(OneMerge merge) { + mergeSource.onMergeFinished(merge); + } + + @Override + public boolean hasPendingMerges() { + return mergeSource.hasPendingMerges(); + } + + @Override + public void merge(OneMerge merge) throws IOException { + assertNotNull(merge); + try { + // most merges need to acquire the semaphore in order to run + if (frequently()) { + runMergeSemaphore.acquire(); + } + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + // assert to-be-run merge was enqueued + assertTrue(TestEnginePlugin.this.enqueuedMergesSet.remove(merge)); + TestEnginePlugin.this.runningMergesSet.add(merge); + assertThat( + TestEnginePlugin.this.runningMergesSet.size(), + lessThanOrEqualTo( + TestEnginePlugin.this.mergeExecutorServiceReference.get().getMaxConcurrentMerges() + ) + ); + mergeSource.merge(merge); + assertTrue(TestEnginePlugin.this.runningMergesSet.remove(merge)); + } + }, trigger); + } + + @Override + public Directory wrapForMerge(OneMerge merge, Directory in) { + return delegateMergeScheduler.wrapForMerge(merge, in); + } + + @Override + public Executor getIntraMergeExecutor(OneMerge merge) { + return delegateMergeScheduler.getIntraMergeExecutor(merge); + } + + @Override + public void close() throws IOException { + delegateMergeScheduler.close(); + } + }; + } + } + } + + @Override + public Optional getEngineFactory(IndexSettings indexSettings) { + return Optional.of(TestInternalEngine::new); + } + + } + + public void testMergingFallsBehindAndThenCatchesUp() throws Exception { + createIndex( + "index", + // stress test merging across multiple shards + indexSettings(randomIntBetween(1, 10), 0) + // few segments per merge ought to result in more merging activity + .put(MergePolicyConfig.INDEX_MERGE_POLICY_MAX_MERGE_AT_ONCE_SETTING.getKey(), 
randomIntBetween(2, 3)) + .put(MergePolicyConfig.INDEX_MERGE_POLICY_SEGMENTS_PER_TIER_SETTING.getKey(), randomIntBetween(2, 3)) + // few concurrent merges allowed per scheduler + .put(MergeSchedulerConfig.MAX_THREAD_COUNT_SETTING.getKey(), randomIntBetween(1, MERGE_SCHEDULER_MAX_CONCURRENCY)) + // many pending merges allowed, in order to disable indexing throttle + .put(MergeSchedulerConfig.MAX_MERGE_COUNT_SETTING.getKey(), randomIntBetween(1, Integer.MAX_VALUE)) + .build() + ); + ensureGreen("index"); + // generate merging activity across many threads + Thread[] indexingThreads = new Thread[randomIntBetween(20, 30)]; + AtomicBoolean indexingDone = new AtomicBoolean(false); + for (int i = 0; i < indexingThreads.length; i++) { + int finalI = i; + indexingThreads[i] = new Thread(() -> { + long termUpto = 0; + while (indexingDone.get() == false) { + for (int j = 0; j < 100; j++) { + // Provoke slowish merging by making many unique terms: + StringBuilder sb = new StringBuilder(); + for (int k = 0; k < 100; k++) { + sb.append(' '); + sb.append(termUpto++); + } + prepareIndex("index").setId("thread_" + finalI + "_term_" + termUpto) + .setSource("field" + (j % 10), sb.toString()) + .get(); + if (j % 2 == 0) { + indicesAdmin().prepareRefresh("index").get(); + } + } + indicesAdmin().prepareRefresh("index").get(); + } + }); + indexingThreads[i].start(); + } + TestEnginePlugin testEnginePlugin = getTestEnginePlugin(); + assertBusy(() -> { + // wait for merges to enqueue or backlog + assertThat(testEnginePlugin.enqueuedMergesSet.size(), greaterThanOrEqualTo(testEnginePlugin.waitMergesEnqueuedCount)); + }, 1, TimeUnit.MINUTES); + // finish up indexing + indexingDone.set(true); + for (Thread indexingThread : indexingThreads) { + indexingThread.join(); + } + // unblock merge threads + testEnginePlugin.allowAllMerging(); + // await all merging to catch up + assertBusy(() -> { + assertThat(testEnginePlugin.runningMergesSet.size(), is(0)); + 
assertThat(testEnginePlugin.enqueuedMergesSet.size(), is(0)); + testEnginePlugin.mergeExecutorServiceReference.get().allDone(); + }, 1, TimeUnit.MINUTES); + var segmentsCountAfterMergingCaughtUp = getSegmentsCountForAllShards("index"); + // force merge should be a noop after all available merging was done + assertAllSuccessful(indicesAdmin().prepareForceMerge("index").get()); + var segmentsCountAfterForceMerge = getSegmentsCountForAllShards("index"); + assertThat(segmentsCountAfterForceMerge, is(segmentsCountAfterMergingCaughtUp)); + // let's also run a force-merge to 1 segment + assertAllSuccessful(indicesAdmin().prepareForceMerge("index").setMaxNumSegments(1).get()); + assertAllSuccessful(indicesAdmin().prepareRefresh("index").get()); + // assert one segment per shard + { + IndicesSegmentResponse indicesSegmentResponse = indicesAdmin().prepareSegments("index").get(); + Iterator indexShardSegmentsIterator = indicesSegmentResponse.getIndices().get("index").iterator(); + while (indexShardSegmentsIterator.hasNext()) { + for (ShardSegments segments : indexShardSegmentsIterator.next()) { + assertThat(segments.getSegments().size(), is(1)); + } + } + } + } + + private int getSegmentsCountForAllShards(String indexName) { + // refresh, otherwise we'd be still seeing the old merged-away segments + assertAllSuccessful(indicesAdmin().prepareRefresh(indexName).get()); + int count = 0; + IndicesSegmentResponse indicesSegmentResponse = indicesAdmin().prepareSegments(indexName).get(); + Iterator indexShardSegmentsIterator = indicesSegmentResponse.getIndices().get(indexName).iterator(); + while (indexShardSegmentsIterator.hasNext()) { + for (ShardSegments segments : indexShardSegmentsIterator.next()) { + count += segments.getSegments().size(); + } + } + return count; + } + + private TestEnginePlugin getTestEnginePlugin() { + return getInstanceFromNode(PluginsService.class).filterPlugins(TestEnginePlugin.class).toList().get(0); + } +} diff --git 
a/server/src/internalClusterTest/java/org/elasticsearch/index/shard/IndexShardIT.java b/server/src/internalClusterTest/java/org/elasticsearch/index/shard/IndexShardIT.java index 3338675160268..150262f98a10b 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/index/shard/IndexShardIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/index/shard/IndexShardIT.java @@ -625,6 +625,7 @@ public static final IndexShard newIndexShard( indexService.getIndexEventListener(), wrapper, indexService.getThreadPool(), + indexService.getThreadPoolMergeExecutorService(), indexService.getBigArrays(), null, Collections.emptyList(), diff --git a/server/src/internalClusterTest/java/org/elasticsearch/indices/IndexingMemoryControllerIT.java b/server/src/internalClusterTest/java/org/elasticsearch/indices/IndexingMemoryControllerIT.java index 74ccdce19d3ad..0ac8c4d0b6fd4 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/indices/IndexingMemoryControllerIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/indices/IndexingMemoryControllerIT.java @@ -59,6 +59,7 @@ EngineConfig engineConfigWithLargerIndexingMemory(EngineConfig config) { return new EngineConfig( config.getShardId(), config.getThreadPool(), + config.getThreadPoolMergeExecutorService(), indexSettings, config.getWarmer(), config.getStore(), diff --git a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java index aecc750bd4e39..dbc18468aa9bd 100644 --- a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java @@ -87,6 +87,7 @@ import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.IndexingPressure; import org.elasticsearch.index.MergePolicyConfig; +import org.elasticsearch.index.engine.ThreadPoolMergeScheduler; import 
org.elasticsearch.indices.IndexingMemoryController; import org.elasticsearch.indices.IndicesQueryCache; import org.elasticsearch.indices.IndicesRequestCache; @@ -619,6 +620,7 @@ public void apply(Settings value, Settings current, Settings previous) { TDigestExecutionHint.SETTING, MergePolicyConfig.DEFAULT_MAX_MERGED_SEGMENT_SETTING, MergePolicyConfig.DEFAULT_MAX_TIME_BASED_MERGED_SEGMENT_SETTING, + ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING, TransportService.ENABLE_STACK_OVERFLOW_AVOIDANCE, DataStreamGlobalRetentionSettings.DATA_STREAMS_DEFAULT_RETENTION_SETTING, DataStreamGlobalRetentionSettings.DATA_STREAMS_MAX_RETENTION_SETTING, diff --git a/server/src/main/java/org/elasticsearch/index/IndexModule.java b/server/src/main/java/org/elasticsearch/index/IndexModule.java index 7d63a0432cdbc..6cd63b3c0047d 100644 --- a/server/src/main/java/org/elasticsearch/index/IndexModule.java +++ b/server/src/main/java/org/elasticsearch/index/IndexModule.java @@ -43,6 +43,7 @@ import org.elasticsearch.index.cache.query.QueryCache; import org.elasticsearch.index.engine.Engine; import org.elasticsearch.index.engine.EngineFactory; +import org.elasticsearch.index.engine.ThreadPoolMergeExecutorService; import org.elasticsearch.index.mapper.IdFieldMapper; import org.elasticsearch.index.mapper.MapperMetrics; import org.elasticsearch.index.mapper.MapperRegistry; @@ -470,6 +471,7 @@ public IndexService newIndexService( CircuitBreakerService circuitBreakerService, BigArrays bigArrays, ThreadPool threadPool, + ThreadPoolMergeExecutorService threadPoolMergeExecutorService, ScriptService scriptService, ClusterService clusterService, Client client, @@ -523,6 +525,7 @@ public IndexService newIndexService( circuitBreakerService, bigArrays, threadPool, + threadPoolMergeExecutorService, scriptService, clusterService, client, diff --git a/server/src/main/java/org/elasticsearch/index/IndexService.java b/server/src/main/java/org/elasticsearch/index/IndexService.java index 
baba9e94db7a7..3617ed3ff3beb 100644 --- a/server/src/main/java/org/elasticsearch/index/IndexService.java +++ b/server/src/main/java/org/elasticsearch/index/IndexService.java @@ -49,6 +49,7 @@ import org.elasticsearch.index.cache.query.QueryCache; import org.elasticsearch.index.engine.Engine; import org.elasticsearch.index.engine.EngineFactory; +import org.elasticsearch.index.engine.ThreadPoolMergeExecutorService; import org.elasticsearch.index.fielddata.FieldDataContext; import org.elasticsearch.index.fielddata.IndexFieldData; import org.elasticsearch.index.fielddata.IndexFieldDataCache; @@ -154,6 +155,8 @@ public class IndexService extends AbstractIndexComponent implements IndicesClust private final AsyncTrimTranslogTask trimTranslogTask; private final ThreadPool threadPool; + @Nullable + private final ThreadPoolMergeExecutorService threadPoolMergeExecutorService; private final BigArrays bigArrays; private final ScriptService scriptService; private final ClusterService clusterService; @@ -178,6 +181,7 @@ public IndexService( CircuitBreakerService circuitBreakerService, BigArrays bigArrays, ThreadPool threadPool, + ThreadPoolMergeExecutorService threadPoolMergeExecutorService, ScriptService scriptService, ClusterService clusterService, Client client, @@ -261,6 +265,7 @@ public IndexService( this.indexFoldersDeletionListener = indexFoldersDeletionListener; this.bigArrays = bigArrays; this.threadPool = threadPool; + this.threadPoolMergeExecutorService = threadPoolMergeExecutorService; this.scriptService = scriptService; this.clusterService = clusterService; this.client = client; @@ -556,6 +561,7 @@ public synchronized IndexShard createShard( eventListener, readerWrapper, threadPool, + threadPoolMergeExecutorService, bigArrays, engineWarmer, searchOperationListeners, @@ -820,6 +826,10 @@ public ThreadPool getThreadPool() { return threadPool; } + public @Nullable ThreadPoolMergeExecutorService getThreadPoolMergeExecutorService() { + return 
threadPoolMergeExecutorService; + } + /** * The {@link BigArrays} to use for this index. */ diff --git a/server/src/main/java/org/elasticsearch/index/engine/EngineConfig.java b/server/src/main/java/org/elasticsearch/index/engine/EngineConfig.java index af3c2cd5172f6..1ef42cdb922c3 100644 --- a/server/src/main/java/org/elasticsearch/index/engine/EngineConfig.java +++ b/server/src/main/java/org/elasticsearch/index/engine/EngineConfig.java @@ -58,6 +58,8 @@ public final class EngineConfig { private final MapperService mapperService; private final IndexStorePlugin.SnapshotCommitSupplier snapshotCommitSupplier; private final ThreadPool threadPool; + @Nullable + private final ThreadPoolMergeExecutorService threadPoolMergeExecutorService; private final Engine.Warmer warmer; private final Store store; private final MergePolicy mergePolicy; @@ -150,6 +152,7 @@ public Supplier retentionLeasesSupplier() { public EngineConfig( ShardId shardId, ThreadPool threadPool, + ThreadPoolMergeExecutorService threadPoolMergeExecutorService, IndexSettings indexSettings, Engine.Warmer warmer, Store store, @@ -179,6 +182,7 @@ public EngineConfig( this.shardId = shardId; this.indexSettings = indexSettings; this.threadPool = threadPool; + this.threadPoolMergeExecutorService = threadPoolMergeExecutorService; this.warmer = warmer == null ? (a) -> {} : warmer; this.store = store; this.mergePolicy = mergePolicy; @@ -287,6 +291,10 @@ public ThreadPool getThreadPool() { return threadPool; } + public @Nullable ThreadPoolMergeExecutorService getThreadPoolMergeExecutorService() { + return threadPoolMergeExecutorService; + } + /** * Returns an {@link org.elasticsearch.index.engine.Engine.Warmer} used to warm new searchers before they are used for searching. 
*/ diff --git a/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java b/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java index d113e60edb3dd..27448a0b2b4a2 100644 --- a/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java +++ b/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java @@ -254,7 +254,11 @@ public InternalEngine(EngineConfig engineConfig) { boolean success = false; try { this.lastDeleteVersionPruneTimeMSec = engineConfig.getThreadPool().relativeTimeInMillis(); - mergeScheduler = createMergeScheduler(engineConfig.getShardId(), engineConfig.getIndexSettings()); + mergeScheduler = createMergeScheduler( + engineConfig.getShardId(), + engineConfig.getIndexSettings(), + engineConfig.getThreadPoolMergeExecutorService() + ); scheduler = mergeScheduler.getMergeScheduler(); throttle = new IndexThrottle(); try { @@ -2818,15 +2822,95 @@ LiveIndexWriterConfig getCurrentIndexWriterConfig() { return indexWriter.getConfig(); } - protected ElasticsearchMergeScheduler createMergeScheduler(ShardId shardId, IndexSettings indexSettings) { - return new EngineMergeScheduler(shardId, indexSettings); + private void maybeFlushAfterMerge(OnGoingMerge merge) { + if (indexWriter.hasPendingMerges() == false && System.nanoTime() - lastWriteNanos >= engineConfig.getFlushMergesAfter().nanos()) { + // NEVER do this on a merge thread since we acquire some locks blocking here and if we concurrently rollback the + // writer + // we deadlock on engine#close for instance. 
+ engineConfig.getThreadPool().executor(ThreadPool.Names.FLUSH).execute(new AbstractRunnable() { + @Override + public void onFailure(Exception e) { + if (isClosed.get() == false) { + logger.warn("failed to flush after merge has finished", e); + } else { + logger.info("failed to flush after merge has finished during shard close"); + } + } + + @Override + protected void doRun() { + // if we have no pending merges and we are supposed to flush once merges have finished to + // free up transient disk usage of the (presumably biggish) segments that were just merged + flush(); + } + }); + } else if (merge.getTotalBytesSize() >= engineConfig.getIndexSettings().getFlushAfterMergeThresholdSize().getBytes()) { + // we hit a significant merge which would allow us to free up memory if we'd commit it hence on the next change + // we should execute a flush on the next operation if that's a flush after inactive or indexing a document. + // we could fork a thread and do it right away but we try to minimize forking and piggyback on outside events. 
+ shouldPeriodicallyFlushAfterBigMerge.set(true); + } + } + + protected ElasticsearchMergeScheduler createMergeScheduler( + ShardId shardId, + IndexSettings indexSettings, + @Nullable ThreadPoolMergeExecutorService threadPoolMergeExecutorService + ) { + if (threadPoolMergeExecutorService != null) { + return new EngineThreadPoolMergeScheduler(shardId, indexSettings, threadPoolMergeExecutorService); + } else { + return new EngineConcurrentMergeScheduler(shardId, indexSettings); + } + } + + private final class EngineThreadPoolMergeScheduler extends ThreadPoolMergeScheduler { + EngineThreadPoolMergeScheduler( + ShardId shardId, + IndexSettings indexSettings, + ThreadPoolMergeExecutorService threadPoolMergeExecutorService + ) { + super(shardId, indexSettings, threadPoolMergeExecutorService); + } + + @Override + protected synchronized void enableIndexingThrottling(int numRunningMerges, int numQueuedMerges, int configuredMaxMergeCount) { + logger.info( + "now throttling indexing: numRunningMerges={}, numQueuedMerges={}, maxNumMergesConfigured={}", + numRunningMerges, + numQueuedMerges, + configuredMaxMergeCount + ); + InternalEngine.this.activateThrottling(); + } + + @Override + protected synchronized void disableIndexingThrottling(int numRunningMerges, int numQueuedMerges, int configuredMaxMergeCount) { + logger.info( + "stop throttling indexing: numRunningMerges={}, numQueuedMerges={}, maxNumMergesConfigured={}", + numRunningMerges, + numQueuedMerges, + configuredMaxMergeCount + ); + InternalEngine.this.deactivateThrottling(); + } + + @Override + public synchronized void afterMerge(OnGoingMerge merge) { + maybeFlushAfterMerge(merge); + } + + @Override + protected void handleMergeException(final Throwable exc) { + mergeException(exc); + } } - private final class EngineMergeScheduler extends ElasticsearchConcurrentMergeScheduler { + private final class EngineConcurrentMergeScheduler extends ElasticsearchConcurrentMergeScheduler { private final AtomicInteger 
numMergesInFlight = new AtomicInteger(0); private final AtomicBoolean isThrottling = new AtomicBoolean(); - EngineMergeScheduler(ShardId shardId, IndexSettings indexSettings) { + EngineConcurrentMergeScheduler(ShardId shardId, IndexSettings indexSettings) { super(shardId, indexSettings); } @@ -2850,33 +2934,7 @@ public synchronized void afterMerge(OnGoingMerge merge) { deactivateThrottling(); } } - if (indexWriter.hasPendingMerges() == false - && System.nanoTime() - lastWriteNanos >= engineConfig.getFlushMergesAfter().nanos()) { - // NEVER do this on a merge thread since we acquire some locks blocking here and if we concurrently rollback the writer - // we deadlock on engine#close for instance. - engineConfig.getThreadPool().executor(ThreadPool.Names.FLUSH).execute(new AbstractRunnable() { - @Override - public void onFailure(Exception e) { - if (isClosed.get() == false) { - logger.warn("failed to flush after merge has finished", e); - } else { - logger.info("failed to flush after merge has finished during shard close"); - } - } - - @Override - protected void doRun() { - // if we have no pending merges and we are supposed to flush once merges have finished to - // free up transient disk usage of the (presumably biggish) segments that were just merged - flush(); - } - }); - } else if (merge.getTotalBytesSize() >= engineConfig.getIndexSettings().getFlushAfterMergeThresholdSize().getBytes()) { - // we hit a significant merge which would allow us to free up memory if we'd commit it hence on the next change - // we should execute a flush on the next operation if that's a flush after inactive or indexing a document. - // we could fork a thread and do it right away but we try to minimize forking and piggyback on outside events. 
- shouldPeriodicallyFlushAfterBigMerge.set(true); - } + maybeFlushAfterMerge(merge); } @Override diff --git a/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorService.java b/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorService.java new file mode 100644 index 0000000000000..5217edb5490dc --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorService.java @@ -0,0 +1,304 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.engine; + +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.unit.ByteSizeValue; +import org.elasticsearch.common.util.concurrent.ConcurrentCollections; +import org.elasticsearch.core.Nullable; +import org.elasticsearch.index.engine.ThreadPoolMergeScheduler.MergeTask; +import org.elasticsearch.threadpool.ThreadPool; + +import java.util.Comparator; +import java.util.Set; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.PriorityBlockingQueue; +import java.util.concurrent.RejectedExecutionException; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.function.LongUnaryOperator; + +import static org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule.ABORT; +import static org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule.BACKLOG; +import static org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule.RUN; + +public class ThreadPoolMergeExecutorService { 
+ /** + * Floor for IO write rate limit of individual merge tasks (we will never go any lower than this) + */ + static final ByteSizeValue MIN_IO_RATE = ByteSizeValue.ofMb(5L); + /** + * Ceiling for IO write rate limit of individual merge tasks (we will never go any higher than this) + */ + static final ByteSizeValue MAX_IO_RATE = ByteSizeValue.ofMb(10240L); + /** + * Initial value for IO write rate limit of individual merge tasks when doAutoIOThrottle is true + */ + static final ByteSizeValue START_IO_RATE = ByteSizeValue.ofMb(20L); + /** + * Total number of submitted merge tasks that support IO auto throttling and that have not yet been run (or aborted). + * This includes merge tasks that are currently running and that are backlogged (by their respective merge schedulers). + */ + private final AtomicInteger ioThrottledMergeTasksCount = new AtomicInteger(); + /** + * The merge tasks that are waiting execution. This does NOT include backlogged or currently executing merge tasks. + * For instance, this can be empty while there are backlogged merge tasks awaiting re-enqueuing. + */ + private final PriorityBlockingQueue queuedMergeTasks = new PriorityBlockingQueue<>( + 64, + Comparator.comparingLong(MergeTask::estimatedMergeSize) + ); + /** + * The set of all merge tasks currently being executed by merge threads from the pool. + * These are tracked notably in order to be able to update their disk IO throttle rate, after they have started, while executing. + */ + private final Set runningMergeTasks = ConcurrentCollections.newConcurrentSet(); + /** + * Current IO write throttle rate, in bytes per sec, that's in effect for all currently running merge tasks, + * across all {@link ThreadPoolMergeScheduler}s that use this instance of the queue. 
+ */ + private final AtomicIORate targetIORateBytesPerSec = new AtomicIORate(START_IO_RATE.getBytes()); + private final ExecutorService executorService; + /** + * The maximum number of concurrently running merges, given the number of threads in the pool. + */ + private final int maxConcurrentMerges; + private final int concurrentMergesFloorLimitForThrottling; + private final int concurrentMergesCeilLimitForThrottling; + + public static @Nullable ThreadPoolMergeExecutorService maybeCreateThreadPoolMergeExecutorService( + ThreadPool threadPool, + Settings settings + ) { + if (ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.get(settings)) { + return new ThreadPoolMergeExecutorService(threadPool); + } else { + return null; + } + } + + private ThreadPoolMergeExecutorService(ThreadPool threadPool) { + this.executorService = threadPool.executor(ThreadPool.Names.MERGE); + this.maxConcurrentMerges = threadPool.info(ThreadPool.Names.MERGE).getMax(); + this.concurrentMergesFloorLimitForThrottling = maxConcurrentMerges * 2; + this.concurrentMergesCeilLimitForThrottling = maxConcurrentMerges * 4; + } + + boolean submitMergeTask(MergeTask mergeTask) { + assert mergeTask.isRunning() == false; + // first enqueue the runnable that runs exactly one merge task (the smallest it can find) + if (enqueueMergeTaskExecution() == false) { + // if the thread pool cannot run the merge, just abort it + mergeTask.abort(); + return false; + } else { + if (mergeTask.supportsIOThrottling()) { + // count enqueued merge tasks that support IO auto throttling, and maybe adjust IO rate for all + int currentTaskCount = ioThrottledMergeTasksCount.incrementAndGet(); + targetIORateBytesPerSec.update( + currentTargetIORateBytesPerSec -> newTargetIORateBytesPerSec( + currentTargetIORateBytesPerSec, + currentTaskCount, + concurrentMergesFloorLimitForThrottling, + concurrentMergesCeilLimitForThrottling + ), + (prevTargetIORateBytesPerSec, newTargetIORateBytesPerSec) -> { + // it's OK to have 
this method update merge tasks concurrently, with different targetMBPerSec values, + // as it's not important that all merge tasks are throttled to the same IO rate at all time. + // For performance reasons, we don't synchronize the updates to targetMBPerSec values with the update of running + // merges. + if (prevTargetIORateBytesPerSec != newTargetIORateBytesPerSec) { + runningMergeTasks.forEach(runningMergeTask -> { + if (runningMergeTask.supportsIOThrottling()) { + runningMergeTask.setIORateLimit(newTargetIORateBytesPerSec); + } + }); + } + } + ); + } + // then enqueue the merge task proper + queuedMergeTasks.add(mergeTask); + return true; + } + } + + void reEnqueueBackloggedMergeTask(MergeTask mergeTask) { + queuedMergeTasks.add(mergeTask); + } + + public boolean allDone() { + return queuedMergeTasks.isEmpty() && runningMergeTasks.isEmpty() && ioThrottledMergeTasksCount.get() == 0L; + } + + /** + * Enqueues a runnable that executes exactly one merge task, the smallest that is runnable at some point in time. + * A merge task is not runnable if its scheduler already reached the configured max-allowed concurrency level. + */ + private boolean enqueueMergeTaskExecution() { + try { + executorService.execute(() -> { + // one such runnable always executes a SINGLE merge task from the queue + // this is important for merge queue statistics, i.e. the executor's queue size represents the current amount of merges + while (true) { + MergeTask smallestMergeTask; + try { + // will block if there are backlogged merges until they're enqueued again + smallestMergeTask = queuedMergeTasks.take(); + } catch (InterruptedException e) { + // An active worker thread has been interrupted while waiting for backlogged merges to be re-enqueued. + // In this case, we terminate the worker thread promptly and forget about the backlogged merges. 
+ // It is OK to forget about merges in this case, because active worker threads are only interrupted + // when the node is shutting down, in which case in-memory accounting of merging activity is not relevant. + // As part of {@link java.util.concurrent.ThreadPoolExecutor#shutdownNow()} the thread pool's work queue + // is also drained, so any queued merge tasks are also forgotten. + break; + } + // let the task's scheduler decide if it can actually run the merge task now + ThreadPoolMergeScheduler.Schedule schedule = smallestMergeTask.schedule(); + if (schedule == RUN) { + runMergeTask(smallestMergeTask); + break; + } else if (schedule == ABORT) { + abortMergeTask(smallestMergeTask); + break; + } else { + assert schedule == BACKLOG; + // the merge task is backlogged by the merge scheduler, try to get the next smallest one + // it's then the duty of the said merge scheduler to re-enqueue the backlogged merge task when it can be run + } + } + }); + return true; + } catch (Throwable t) { + // cannot execute merges because the executor is shutting down + assert t instanceof RejectedExecutionException; + return false; + } + } + + private void runMergeTask(MergeTask mergeTask) { + assert mergeTask.isRunning() == false; + boolean added = runningMergeTasks.add(mergeTask); + assert added : "starting merge task [" + mergeTask + "] registered as already running"; + try { + if (mergeTask.supportsIOThrottling()) { + mergeTask.setIORateLimit(targetIORateBytesPerSec.get()); + } + mergeTask.run(); + } finally { + boolean removed = runningMergeTasks.remove(mergeTask); + assert removed : "completed merge task [" + mergeTask + "] not registered as running"; + if (mergeTask.supportsIOThrottling()) { + ioThrottledMergeTasksCount.decrementAndGet(); + } + } + } + + private void abortMergeTask(MergeTask mergeTask) { + assert mergeTask.isRunning() == false; + assert runningMergeTasks.contains(mergeTask) == false; + try { + mergeTask.abort(); + } finally { + if 
(mergeTask.supportsIOThrottling()) { + ioThrottledMergeTasksCount.decrementAndGet(); + } + } + } + + private static long newTargetIORateBytesPerSec( + long currentTargetIORateBytesPerSec, + int currentlySubmittedIOThrottledMergeTasks, + int concurrentMergesFloorLimitForThrottling, + int concurrentMergesCeilLimitForThrottling + ) { + final long newTargetIORateBytesPerSec; + if (currentlySubmittedIOThrottledMergeTasks < concurrentMergesFloorLimitForThrottling + && currentTargetIORateBytesPerSec > MIN_IO_RATE.getBytes()) { + // decrease target IO rate by 10% (capped) + newTargetIORateBytesPerSec = Math.max( + MIN_IO_RATE.getBytes(), + currentTargetIORateBytesPerSec - currentTargetIORateBytesPerSec / 10L + ); + } else if (currentlySubmittedIOThrottledMergeTasks > concurrentMergesCeilLimitForThrottling + && currentTargetIORateBytesPerSec < MAX_IO_RATE.getBytes()) { + // increase target IO rate by 10% (capped) + newTargetIORateBytesPerSec = Math.min( + MAX_IO_RATE.getBytes(), + currentTargetIORateBytesPerSec + currentTargetIORateBytesPerSec / 10L + ); + } else { + newTargetIORateBytesPerSec = currentTargetIORateBytesPerSec; + } + return newTargetIORateBytesPerSec; + } + + static class AtomicIORate { + private final AtomicLong ioRate; + + AtomicIORate(long initialIORate) { + ioRate = new AtomicLong(initialIORate); + } + + long get() { + return ioRate.get(); + } + + // Exactly like {@link AtomicLong#updateAndGet} but calls the consumer rather than return the new (updated) value. + // The consumer receives both the previous and the updated values (which can be equal). 
+ void update(LongUnaryOperator updateFunction, AtomicIORate.UpdateConsumer updateConsumer) { + long prev = ioRate.get(), next = 0L; + for (boolean haveNext = false;;) { + if (haveNext == false) next = updateFunction.applyAsLong(prev); + if (ioRate.weakCompareAndSetVolatile(prev, next)) { + updateConsumer.accept(prev, next); + return; + } + haveNext = (prev == (prev = ioRate.get())); + } + } + + @FunctionalInterface + interface UpdateConsumer { + void accept(long prev, long next); + } + } + + // exposed for tests + Set getRunningMergeTasks() { + return runningMergeTasks; + } + + // exposed for tests + PriorityBlockingQueue getQueuedMergeTasks() { + return queuedMergeTasks; + } + + // exposed for tests and stats + long getTargetIORateBytesPerSec() { + return targetIORateBytesPerSec.get(); + } + + // exposed for tests + int getMaxConcurrentMerges() { + return maxConcurrentMerges; + } + + // exposed for tests + int getConcurrentMergesFloorLimitForThrottling() { + return concurrentMergesFloorLimitForThrottling; + } + + // exposed for tests + int getConcurrentMergesCeilLimitForThrottling() { + return concurrentMergesCeilLimitForThrottling; + } +} diff --git a/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeScheduler.java b/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeScheduler.java new file mode 100644 index 0000000000000..8cfdc59268365 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeScheduler.java @@ -0,0 +1,529 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.index.engine; + +import org.apache.logging.log4j.Logger; +import org.apache.lucene.index.MergePolicy; +import org.apache.lucene.index.MergeRateLimiter; +import org.apache.lucene.index.MergeScheduler; +import org.apache.lucene.index.MergeTrigger; +import org.apache.lucene.store.AlreadyClosedException; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FilterDirectory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RateLimitedIndexOutput; +import org.elasticsearch.common.logging.Loggers; +import org.elasticsearch.common.settings.Setting; +import org.elasticsearch.common.unit.ByteSizeValue; +import org.elasticsearch.core.TimeValue; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.MergeSchedulerConfig; +import org.elasticsearch.index.merge.MergeStats; +import org.elasticsearch.index.merge.OnGoingMerge; +import org.elasticsearch.index.shard.ShardId; + +import java.io.IOException; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; +import java.util.PriorityQueue; +import java.util.Set; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicLong; + +public class ThreadPoolMergeScheduler extends MergeScheduler implements ElasticsearchMergeScheduler { + public static final Setting USE_THREAD_POOL_MERGE_SCHEDULER_SETTING = Setting.boolSetting( + "indices.merge.scheduler.use_thread_pool", + true, + Setting.Property.NodeScope + ); + private final ShardId shardId; + private final MergeSchedulerConfig config; + private final Logger logger; + private final MergeTracking mergeTracking; + private final ThreadPoolMergeExecutorService threadPoolMergeExecutorService; + private final PriorityQueue backloggedMergeTasks = new PriorityQueue<>( + 16, + 
Comparator.comparingLong(MergeTask::estimatedMergeSize) + ); + private final Map runningMergeTasks = new HashMap<>(); + // set when incoming merges should be throttled (i.e. restrict the indexing rate) + private final AtomicBoolean shouldThrottleIncomingMerges = new AtomicBoolean(); + // how many {@link MergeTask}s have kicked off (this is used to name them). + private final AtomicLong submittedMergeTaskCount = new AtomicLong(); + private final AtomicLong doneMergeTaskCount = new AtomicLong(); + private final CountDownLatch closedWithNoRunningMerges = new CountDownLatch(1); + private volatile boolean closed = false; + + public ThreadPoolMergeScheduler( + ShardId shardId, + IndexSettings indexSettings, + ThreadPoolMergeExecutorService threadPoolMergeExecutorService + ) { + this.shardId = shardId; + this.config = indexSettings.getMergeSchedulerConfig(); + this.logger = Loggers.getLogger(getClass(), shardId); + this.mergeTracking = new MergeTracking( + logger, + () -> this.config.isAutoThrottle() + ? 
ByteSizeValue.ofBytes(threadPoolMergeExecutorService.getTargetIORateBytesPerSec()).getMbFrac() + : Double.POSITIVE_INFINITY + ); + this.threadPoolMergeExecutorService = threadPoolMergeExecutorService; + } + + @Override + public Set onGoingMerges() { + return mergeTracking.onGoingMerges(); + } + + @Override + public MergeStats stats() { + return mergeTracking.stats(); + } + + @Override + public MergeScheduler getMergeScheduler() { + return this; + } + + @Override + public void refreshConfig() { + // if maxMergeCount changed, maybe we need to toggle merge task throttling + checkMergeTaskThrottling(); + // if maxThreadCount changed, maybe some backlogged merges are now allowed to run + enqueueBackloggedTasks(); + } + + @Override + public void merge(MergeSource mergeSource, MergeTrigger trigger) { + if (closed) { + // avoid pulling from the merge source when closing + return; + } + MergePolicy.OneMerge merge = null; + try { + merge = mergeSource.getNextMerge(); + } catch (IllegalStateException e) { + if (verbose()) { + message("merge task poll failed, likely that index writer is failed"); + } + // ignore exception, we expect the IW failure to be logged elsewhere + } + if (merge != null) { + submitNewMergeTask(mergeSource, merge, trigger); + } + } + + @Override + public MergeScheduler clone() { + // Lucene IW makes a clone internally but since we hold on to this instance + // the clone will just be the identity. + return this; + } + + /** + * A callback allowing for custom logic before an actual merge starts. + */ + protected void beforeMerge(OnGoingMerge merge) {} + + /** + * A callback allowing for custom logic after an actual merge starts. + */ + protected void afterMerge(OnGoingMerge merge) {} + + /** + * A callback that's invoked when indexing should throttle down indexing in order to let merging to catch up. 
+ */ + protected void enableIndexingThrottling(int numRunningMerges, int numQueuedMerges, int configuredMaxMergeCount) {} + + /** + * A callback that's invoked when indexing should un-throttle because merging caught up. + * This is invoked sometime after {@link #enableIndexingThrottling(int, int, int)} was invoked in the first place. + */ + protected void disableIndexingThrottling(int numRunningMerges, int numQueuedMerges, int configuredMaxMergeCount) {} + + /** + * A callback for exceptions thrown while merging. + */ + protected void handleMergeException(Throwable t) { + throw new MergePolicy.MergeException(t); + } + + // package-private for tests + boolean submitNewMergeTask(MergeSource mergeSource, MergePolicy.OneMerge merge, MergeTrigger mergeTrigger) { + try { + MergeTask mergeTask = newMergeTask(mergeSource, merge, mergeTrigger); + return threadPoolMergeExecutorService.submitMergeTask(mergeTask); + } finally { + checkMergeTaskThrottling(); + } + } + + // package-private for tests + MergeTask newMergeTask(MergeSource mergeSource, MergePolicy.OneMerge merge, MergeTrigger mergeTrigger) { + // forced merges, as well as merges triggered when closing a shard, always run un-IO-throttled + boolean isAutoThrottle = mergeTrigger != MergeTrigger.CLOSING && merge.getStoreMergeInfo().mergeMaxNumSegments() == -1; + // IO throttling cannot be toggled for existing merge tasks, only new merge tasks pick up the updated IO throttling setting + return new MergeTask( + mergeSource, + merge, + isAutoThrottle && config.isAutoThrottle(), + "Lucene Merge Task #" + submittedMergeTaskCount.incrementAndGet() + " for shard " + shardId + ); + } + + private void checkMergeTaskThrottling() { + long submittedMergesCount = submittedMergeTaskCount.get(); + long doneMergesCount = doneMergeTaskCount.get(); + int runningMergesCount = runningMergeTasks.size(); + int configuredMaxMergeCount = config.getMaxMergeCount(); + // both currently running and enqueued merge tasks are considered "active" for 
throttling purposes + int activeMerges = (int) (submittedMergesCount - doneMergesCount); + if (activeMerges > configuredMaxMergeCount && shouldThrottleIncomingMerges.get() == false) { + // maybe enable merge task throttling + synchronized (shouldThrottleIncomingMerges) { + if (shouldThrottleIncomingMerges.getAndSet(true) == false) { + enableIndexingThrottling(runningMergesCount, activeMerges - runningMergesCount, configuredMaxMergeCount); + } + } + } else if (activeMerges <= configuredMaxMergeCount && shouldThrottleIncomingMerges.get()) { + // maybe disable merge task throttling + synchronized (shouldThrottleIncomingMerges) { + if (shouldThrottleIncomingMerges.getAndSet(false)) { + disableIndexingThrottling(runningMergesCount, activeMerges - runningMergesCount, configuredMaxMergeCount); + } + } + } + } + + // exposed for tests + // synchronized so that {@code #closed}, {@code #runningMergeTasks} and {@code #backloggedMergeTasks} are modified atomically + synchronized Schedule schedule(MergeTask mergeTask) { + assert mergeTask.isRunning() == false; + if (closed) { + // do not run or backlog tasks when closing the merge scheduler, instead abort them + return Schedule.ABORT; + } else if (runningMergeTasks.size() < config.getMaxThreadCount()) { + boolean added = runningMergeTasks.put(mergeTask.onGoingMerge.getMerge(), mergeTask) == null; + assert added : "starting merge task [" + mergeTask + "] registered as already running"; + return Schedule.RUN; + } else { + backloggedMergeTasks.add(mergeTask); + return Schedule.BACKLOG; + } + } + + // exposed for tests + synchronized void mergeTaskFinishedRunning(MergeTask mergeTask) { + boolean removed = runningMergeTasks.remove(mergeTask.onGoingMerge.getMerge()) != null; + assert removed : "completed merge task [" + mergeTask + "] not registered as running"; + // when one merge is done, maybe a backlogged one can now execute + enqueueBackloggedTasks(); + // signal here, because, when closing, we wait for all currently running 
merges to finish + maybeSignalAllMergesDoneAfterClose(); + } + + private void mergeTaskDone() { + doneMergeTaskCount.incrementAndGet(); + checkMergeTaskThrottling(); + } + + private synchronized void maybeSignalAllMergesDoneAfterClose() { + if (closed && runningMergeTasks.isEmpty()) { + closedWithNoRunningMerges.countDown(); + } + } + + private synchronized void enqueueBackloggedTasks() { + int maxBackloggedTasksToEnqueue = config.getMaxThreadCount() - runningMergeTasks.size(); + // enqueue all backlogged tasks when closing, as the queue expects all backlogged tasks to always be enqueued back + while (closed || maxBackloggedTasksToEnqueue-- > 0) { + MergeTask backloggedMergeTask = backloggedMergeTasks.poll(); + if (backloggedMergeTask == null) { + break; + } + // no need to abort merge tasks now, they will be aborted on the spot when the scheduler gets to run them + threadPoolMergeExecutorService.reEnqueueBackloggedMergeTask(backloggedMergeTask); + } + } + + /** + * Does the actual merge, by calling {@link org.apache.lucene.index.MergeScheduler.MergeSource#merge} + */ + void doMerge(MergeSource mergeSource, MergePolicy.OneMerge oneMerge) { + try { + mergeSource.merge(oneMerge); + } catch (Throwable t) { + // OK to ignore MergeAbortedException. This is what Lucene's ConcurrentMergeScheduler does. + if (t instanceof MergePolicy.MergeAbortedException == false) { + handleMergeException(t); + } + } + } + + @Override + public Directory wrapForMerge(MergePolicy.OneMerge merge, Directory in) { + // Return a wrapped Directory which has rate-limited output. + // Note: the rate limiter is only per thread (per merge). So, if there are multiple merge threads running + // the combined IO rate per node is, roughly, 'thread_pool_size * merge_queue#targetMBPerSec', as + // the per-thread IO rate is updated, best effort, for all running merge threads concomitantly. 
+ if (merge.isAborted()) { + // merges can theoretically be aborted at any moment + return in; + } + MergeTask mergeTask = runningMergeTasks.get(merge); + if (mergeTask == null) { + throw new IllegalStateException("associated merge task for executing merge not found"); + } + return new FilterDirectory(in) { + @Override + public IndexOutput createOutput(String name, IOContext context) throws IOException { + ensureOpen(); + + // This Directory is only supposed to be used during merging, + // so all writes should have MERGE context, else there is a bug + // somewhere that is failing to pass down the right IOContext: + assert context.context() == IOContext.Context.MERGE : "got context=" + context.context(); + + return new RateLimitedIndexOutput(mergeTask.rateLimiter, in.createOutput(name, context)); + } + }; + } + + class MergeTask implements Runnable { + private final String name; + private final AtomicLong mergeStartTimeNS; + private final MergeSource mergeSource; + private final OnGoingMerge onGoingMerge; + private final MergeRateLimiter rateLimiter; + private final boolean supportsIOThrottling; + + MergeTask(MergeSource mergeSource, MergePolicy.OneMerge merge, boolean supportsIOThrottling, String name) { + this.name = name; + this.mergeStartTimeNS = new AtomicLong(); + this.mergeSource = mergeSource; + this.onGoingMerge = new OnGoingMerge(merge); + this.rateLimiter = new MergeRateLimiter(merge.getMergeProgress()); + this.supportsIOThrottling = supportsIOThrottling; + } + + Schedule schedule() { + return ThreadPoolMergeScheduler.this.schedule(this); + } + + public boolean supportsIOThrottling() { + return supportsIOThrottling; + } + + public void setIORateLimit(long ioRateLimitBytesPerSec) { + if (supportsIOThrottling == false) { + throw new IllegalArgumentException("merge task cannot be IO throttled"); + } + this.rateLimiter.setMBPerSec(ByteSizeValue.ofBytes(ioRateLimitBytesPerSec).getMbFrac()); + } + + public boolean isRunning() { + return mergeStartTimeNS.get() > 
0L; + } + + /** + * Runs the merge associated to this task. MUST be invoked after {@link #schedule()} returned {@link Schedule#RUN}, + * to confirm that the associated {@link MergeScheduler} assents to run the merge. + * Either one of {@link #run()} or {@link #abort()} MUST be invoked exactly once for every {@link MergeTask}. + * After the merge is finished, this will also submit any follow-up merges from the task's merge source. + */ + @Override + public void run() { + assert isRunning() == false; + assert ThreadPoolMergeScheduler.this.runningMergeTasks.containsKey(onGoingMerge.getMerge()) + : "runNowOrBacklog must be invoked before actually running the merge task"; + try { + beforeMerge(onGoingMerge); + try { + if (mergeStartTimeNS.compareAndSet(0L, System.nanoTime()) == false) { + throw new IllegalStateException("The merge task is already started or aborted"); + } + mergeTracking.mergeStarted(onGoingMerge); + if (verbose()) { + message(String.format(Locale.ROOT, "merge task %s start", this)); + } + try { + doMerge(mergeSource, onGoingMerge.getMerge()); + if (verbose()) { + message( + String.format( + Locale.ROOT, + "merge task %s merge segment [%s] done estSize=%.1f MB (written=%.1f MB) " + + "runTime=%.1fs (stopped=%.1fs, paused=%.1fs) rate=%s", + this, + getSegmentName(onGoingMerge.getMerge()), + bytesToMB(onGoingMerge.getMerge().estimatedMergeBytes), + bytesToMB(rateLimiter.getTotalBytesWritten()), + nsToSec(System.nanoTime() - mergeStartTimeNS.get()), + nsToSec(rateLimiter.getTotalStoppedNS()), + nsToSec(rateLimiter.getTotalPausedNS()), + rateToString(rateLimiter.getMBPerSec()) + ) + ); + } + } finally { + long tookMS = TimeValue.nsecToMSec(System.nanoTime() - mergeStartTimeNS.get()); + mergeTracking.mergeFinished(onGoingMerge.getMerge(), onGoingMerge, tookMS); + } + } finally { + afterMerge(onGoingMerge); + } + } finally { + if (verbose()) { + message(String.format(Locale.ROOT, "merge task %s end", this)); + } + try { + mergeTaskFinishedRunning(this); + } 
finally { + mergeTaskDone(); + } + try { + // kick-off any follow-up merge + merge(mergeSource, MergeTrigger.MERGE_FINISHED); + } catch (@SuppressWarnings("unused") AlreadyClosedException ace) { + // OK, this is what the {@code ConcurrentMergeScheduler} does + } + } + } + + /** + * Aborts the merge task, e.g. when the {@link MergeScheduler}, or the + * {@link ThreadPoolMergeExecutorService} are closing. Either one of {@link #run()} or {@link #abort()} + * MUST be invoked exactly once for every {@link MergeTask}. + * An aborted merge means that the segments involved will be made available + * (by the {@link org.apache.lucene.index.IndexWriter}) to any subsequent merges. + */ + void abort() { + assert isRunning() == false; + assert ThreadPoolMergeScheduler.this.runningMergeTasks.containsKey(onGoingMerge.getMerge()) == false + : "cannot abort a merge task that's already running"; + if (verbose()) { + message(String.format(Locale.ROOT, "merge task %s aborted", this)); + } + // {@code IndexWriter} checks the abort flag internally, while running the merge. + // The segments of an aborted merge become available to subsequent merges. + onGoingMerge.getMerge().setAborted(); + try { + if (verbose()) { + message(String.format(Locale.ROOT, "merge task %s start abort", this)); + } + // mark the merge task as running, even though the merge itself is aborted and the task will run for a brief time only + if (mergeStartTimeNS.compareAndSet(0L, System.nanoTime()) == false) { + throw new IllegalStateException("The merge task is already started or aborted"); + } + // This ensures {@code OneMerge#close} gets invoked. + // {@code IndexWriter} considers a merge as "running" once it has been pulled from the {@code MergeSource#getNextMerge}, + // so in theory it's not enough to just call {@code MergeSource#onMergeFinished} on it (as for "pending" ones). 
+ doMerge(mergeSource, onGoingMerge.getMerge()); + } finally { + if (verbose()) { + message(String.format(Locale.ROOT, "merge task %s end abort", this)); + } + mergeTaskDone(); + } + } + + long estimatedMergeSize() { + // TODO is it possible that `estimatedMergeBytes` be `0` for correctly initialize merges, + // or is it always the case that if `estimatedMergeBytes` is `0` that means that the merge has not yet been initialized? + return onGoingMerge.getMerge().getStoreMergeInfo().estimatedMergeBytes(); + } + + @Override + public String toString() { + return name + (onGoingMerge.getMerge().isAborted() ? " (aborted)" : ""); + } + } + + @Override + /* Overridden to route messages to our logger too, in addition to the {@link org.apache.lucene.util.InfoStream} that lucene uses. */ + protected boolean verbose() { + if (logger.isTraceEnabled()) { + return true; + } + return super.verbose(); + } + + @Override + /* Overridden to route messages to our logger too, in addition to the {@link org.apache.lucene.util.InfoStream} that lucene uses. 
*/ + protected void message(String message) { + if (logger.isTraceEnabled()) { + logger.trace("{}", message); + } + super.message(message); + } + + @Override + public void close() throws IOException { + synchronized (this) { + closed = true; + // enqueue any backlogged merge tasks, because the merge queue assumes that the backlogged tasks are always re-enqueued + enqueueBackloggedTasks(); + // signal if there aren't any currently running merges + maybeSignalAllMergesDoneAfterClose(); + } + try { + closedWithNoRunningMerges.await(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } finally { + // this closes an executor that may be used by ongoing merges, so better close it only after all running merges finished + super.close(); + } + } + + // exposed for tests + PriorityQueue getBackloggedMergeTasks() { + return backloggedMergeTasks; + } + + // exposed for tests + Map getRunningMergeTasks() { + return runningMergeTasks; + } + + private static double nsToSec(long ns) { + return ns / (double) TimeUnit.SECONDS.toNanos(1); + } + + private static double bytesToMB(long bytes) { + return bytes / 1024. / 1024.; + } + + private static String getSegmentName(MergePolicy.OneMerge merge) { + return merge.getMergeInfo() != null ? 
merge.getMergeInfo().info.name : "_na_"; + } + + private static String rateToString(double mbPerSec) { + if (mbPerSec == 0.0) { + return "stopped"; + } else if (mbPerSec == Double.POSITIVE_INFINITY) { + return "unlimited"; + } else { + return String.format(Locale.ROOT, "%.1f MB/sec", mbPerSec); + } + } + + enum Schedule { + ABORT, + RUN, + BACKLOG + } +} diff --git a/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java b/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java index e67debfe443bf..ed31a74ac326b 100644 --- a/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java @@ -96,6 +96,7 @@ import org.elasticsearch.index.engine.SafeCommitInfo; import org.elasticsearch.index.engine.Segment; import org.elasticsearch.index.engine.SegmentsStats; +import org.elasticsearch.index.engine.ThreadPoolMergeExecutorService; import org.elasticsearch.index.fielddata.FieldDataStats; import org.elasticsearch.index.fielddata.ShardFieldData; import org.elasticsearch.index.flush.FlushStats; @@ -195,6 +196,8 @@ public class IndexShard extends AbstractIndexShardComponent implements IndicesClusterStateService.Shard { private final ThreadPool threadPool; + @Nullable + private final ThreadPoolMergeExecutorService threadPoolMergeExecutorService; private final MapperService mapperService; private final IndexCache indexCache; private final Store store; @@ -318,6 +321,7 @@ public IndexShard( final IndexEventListener indexEventListener, final CheckedFunction indexReaderWrapper, final ThreadPool threadPool, + final ThreadPoolMergeExecutorService threadPoolMergeExecutorService, final BigArrays bigArrays, final Engine.Warmer warmer, final List searchOperationListener, @@ -344,6 +348,7 @@ public IndexShard( this.indexSortSupplier = indexSortSupplier; this.indexEventListener = indexEventListener; this.threadPool = threadPool; + this.threadPoolMergeExecutorService = 
threadPoolMergeExecutorService; this.mapperService = mapperService; this.indexCache = indexCache; this.internalIndexingStats = new InternalIndexingStats(); @@ -3559,6 +3564,7 @@ private EngineConfig newEngineConfig(LongSupplier globalCheckpointSupplier) { return new EngineConfig( shardId, threadPool, + threadPoolMergeExecutorService, indexSettings, warmer, store, diff --git a/server/src/main/java/org/elasticsearch/indices/IndicesService.java b/server/src/main/java/org/elasticsearch/indices/IndicesService.java index 1df5bddeff9e3..64fa709512bdf 100644 --- a/server/src/main/java/org/elasticsearch/indices/IndicesService.java +++ b/server/src/main/java/org/elasticsearch/indices/IndicesService.java @@ -98,6 +98,7 @@ import org.elasticsearch.index.engine.InternalEngineFactory; import org.elasticsearch.index.engine.NoOpEngine; import org.elasticsearch.index.engine.ReadOnlyEngine; +import org.elasticsearch.index.engine.ThreadPoolMergeExecutorService; import org.elasticsearch.index.fielddata.IndexFieldDataCache; import org.elasticsearch.index.flush.FlushStats; import org.elasticsearch.index.get.GetStats; @@ -231,6 +232,8 @@ public class IndicesService extends AbstractLifecycleComponent private final IndicesFieldDataCache indicesFieldDataCache; private final CacheCleaner cacheCleaner; private final ThreadPool threadPool; + @Nullable + private final ThreadPoolMergeExecutorService threadPoolMergeExecutorService; private final CircuitBreakerService circuitBreakerService; private final BigArrays bigArrays; private final ScriptService scriptService; @@ -286,6 +289,10 @@ protected void doStart() { IndicesService(IndicesServiceBuilder builder) { this.settings = builder.settings; this.threadPool = builder.threadPool; + this.threadPoolMergeExecutorService = ThreadPoolMergeExecutorService.maybeCreateThreadPoolMergeExecutorService( + threadPool, + settings + ); this.pluginsService = builder.pluginsService; this.nodeEnv = builder.nodeEnv; this.parserConfig = 
XContentParserConfiguration.EMPTY.withDeprecationHandler(LoggingDeprecationHandler.INSTANCE) @@ -781,6 +788,7 @@ private synchronized IndexService createIndexService( circuitBreakerService, bigArrays, threadPool, + threadPoolMergeExecutorService, scriptService, clusterService, client, @@ -1906,4 +1914,9 @@ public IndexScopedSettings getIndexScopedSettings() { public BigArrays getBigArrays() { return bigArrays; } + + @Nullable + public ThreadPoolMergeExecutorService getThreadPoolMergeExecutorService() { + return threadPoolMergeExecutorService; + } } diff --git a/server/src/main/java/org/elasticsearch/threadpool/DefaultBuiltInExecutorBuilders.java b/server/src/main/java/org/elasticsearch/threadpool/DefaultBuiltInExecutorBuilders.java index 9698ce6b65cdf..b8dddc20cc51d 100644 --- a/server/src/main/java/org/elasticsearch/threadpool/DefaultBuiltInExecutorBuilders.java +++ b/server/src/main/java/org/elasticsearch/threadpool/DefaultBuiltInExecutorBuilders.java @@ -13,6 +13,7 @@ import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.util.concurrent.EsExecutors; import org.elasticsearch.core.TimeValue; +import org.elasticsearch.index.engine.ThreadPoolMergeScheduler; import org.elasticsearch.threadpool.internal.BuiltInExecutorBuilders; import java.util.HashMap; @@ -141,6 +142,12 @@ public Map getBuilders(Settings settings, int allocated false ) ); + if (ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.get(settings)) { + result.put( + ThreadPool.Names.MERGE, + new ScalingExecutorBuilder(ThreadPool.Names.MERGE, 1, allocatedProcessors, TimeValue.timeValueMinutes(5), true) + ); + } result.put( ThreadPool.Names.FORCE_MERGE, new FixedExecutorBuilder( diff --git a/server/src/main/java/org/elasticsearch/threadpool/ThreadPool.java b/server/src/main/java/org/elasticsearch/threadpool/ThreadPool.java index 96d82793a3f4f..85ee02b6db856 100644 --- a/server/src/main/java/org/elasticsearch/threadpool/ThreadPool.java +++ 
b/server/src/main/java/org/elasticsearch/threadpool/ThreadPool.java @@ -134,6 +134,7 @@ public static class Names { public static final String WARMER = "warmer"; public static final String SNAPSHOT = "snapshot"; public static final String SNAPSHOT_META = "snapshot_meta"; + public static final String MERGE = "merge"; public static final String FORCE_MERGE = "force_merge"; public static final String FETCH_SHARD_STARTED = "fetch_shard_started"; public static final String FETCH_SHARD_STORE = "fetch_shard_store"; @@ -192,6 +193,7 @@ public static ThreadPoolType fromType(String type) { entry(Names.WARMER, ThreadPoolType.SCALING), entry(Names.SNAPSHOT, ThreadPoolType.SCALING), entry(Names.SNAPSHOT_META, ThreadPoolType.SCALING), + entry(Names.MERGE, ThreadPoolType.SCALING), entry(Names.FORCE_MERGE, ThreadPoolType.FIXED), entry(Names.FETCH_SHARD_STARTED, ThreadPoolType.SCALING), entry(Names.FETCH_SHARD_STORE, ThreadPoolType.SCALING), diff --git a/server/src/test/java/org/elasticsearch/index/IndexModuleTests.java b/server/src/test/java/org/elasticsearch/index/IndexModuleTests.java index c519d4834148d..cf1b05bc29630 100644 --- a/server/src/test/java/org/elasticsearch/index/IndexModuleTests.java +++ b/server/src/test/java/org/elasticsearch/index/IndexModuleTests.java @@ -59,6 +59,8 @@ import org.elasticsearch.index.engine.EngineTestCase; import org.elasticsearch.index.engine.InternalEngine; import org.elasticsearch.index.engine.InternalEngineFactory; +import org.elasticsearch.index.engine.ThreadPoolMergeExecutorService; +import org.elasticsearch.index.engine.ThreadPoolMergeScheduler; import org.elasticsearch.index.fielddata.IndexFieldDataCache; import org.elasticsearch.index.mapper.MapperMetrics; import org.elasticsearch.index.mapper.MapperRegistry; @@ -158,6 +160,7 @@ public void beforeShardFoldersDeleted(ShardId shardId, IndexSettings indexSettin }; private MapperRegistry mapperRegistry; private ThreadPool threadPool; + private ThreadPoolMergeExecutorService 
threadPoolMergeExecutorService; private CircuitBreakerService circuitBreakerService; private BigArrays bigArrays; private ScriptService scriptService; @@ -170,6 +173,7 @@ public void setUp() throws Exception { settings = Settings.builder() .put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current()) .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), randomBoolean()) .build(); indicesQueryCache = new IndicesQueryCache(settings); indexSettings = IndexSettingsModule.newIndexSettings("foo", settings); @@ -188,6 +192,7 @@ public void setUp() throws Exception { emptyMap() ); threadPool = new TestThreadPool("test"); + threadPoolMergeExecutorService = ThreadPoolMergeExecutorService.maybeCreateThreadPoolMergeExecutorService(threadPool, settings); circuitBreakerService = new NoneCircuitBreakerService(); PageCacheRecycler pageCacheRecycler = new PageCacheRecycler(settings); bigArrays = new BigArrays(pageCacheRecycler, circuitBreakerService, CircuitBreaker.REQUEST); @@ -214,6 +219,7 @@ private IndexService newIndexService(IndexModule module) throws IOException { circuitBreakerService, bigArrays, threadPool, + threadPoolMergeExecutorService, scriptService, clusterService, null, diff --git a/server/src/test/java/org/elasticsearch/index/engine/InternalEngineTests.java b/server/src/test/java/org/elasticsearch/index/engine/InternalEngineTests.java index 26de6a7897786..3f49d430a9945 100644 --- a/server/src/test/java/org/elasticsearch/index/engine/InternalEngineTests.java +++ b/server/src/test/java/org/elasticsearch/index/engine/InternalEngineTests.java @@ -2578,10 +2578,10 @@ public boolean mergeCompleted() { public void append(LogEvent event) { final String formattedMessage = event.getMessage().getFormattedMessage(); if (event.getLevel() == Level.TRACE && event.getMarker().getName().contains("[index][0]")) { - if (formattedMessage.startsWith("merge thread")) { + if 
(formattedMessage.startsWith("merge task")) { messages.add(formattedMessage); } else if (event.getLoggerName().endsWith(".MS") - && formattedMessage.contains("MS: merge thread") + && formattedMessage.contains("MS: merge task") && formattedMessage.endsWith("end")) { luceneMergeSchedulerEnded.set(true); } @@ -2616,14 +2616,14 @@ public void testMergeThreadLogging() throws Exception { }); assertBusy(() -> { - List threadMsgs = mockAppender.messages().stream().filter(line -> line.startsWith("merge thread")).toList(); + List threadMsgs = mockAppender.messages().stream().filter(line -> line.startsWith("merge task")).toList(); assertThat("messages:" + threadMsgs, threadMsgs.size(), greaterThanOrEqualTo(3)); assertThat( threadMsgs, containsInRelativeOrder( - matchesRegex("^merge thread .* start$"), - matchesRegex("^merge thread .* merge segment.*$"), - matchesRegex("^merge thread .* end$") + matchesRegex("^merge task .* start$"), + matchesRegex("^merge task .* merge segment.*$"), + matchesRegex("^merge task .* end$") ) ); assertThat(mockAppender.mergeCompleted(), is(true)); @@ -3587,6 +3587,7 @@ public void testRecoverFromForeignTranslog() throws IOException { EngineConfig brokenConfig = new EngineConfig( shardId, threadPool, + threadPoolMergeExecutorService, config.getIndexSettings(), null, store, @@ -7149,6 +7150,7 @@ public void testNotWarmUpSearcherInEngineCtor() throws Exception { EngineConfig configWithWarmer = new EngineConfig( config.getShardId(), config.getThreadPool(), + config.getThreadPoolMergeExecutorService(), config.getIndexSettings(), warmer, store, diff --git a/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorServiceTests.java b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorServiceTests.java new file mode 100644 index 0000000000000..0a99c5002d5ad --- /dev/null +++ b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorServiceTests.java @@ -0,0 +1,690 @@ +/* + * Copyright 
Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.engine; + +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.util.concurrent.ConcurrentCollections; +import org.elasticsearch.common.util.concurrent.DeterministicTaskQueue; +import org.elasticsearch.common.util.concurrent.EsExecutors; +import org.elasticsearch.index.engine.ThreadPoolMergeScheduler.MergeTask; +import org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.threadpool.TestThreadPool; +import org.elasticsearch.threadpool.ThreadPool; +import org.mockito.ArgumentCaptor; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Comparator; +import java.util.List; +import java.util.PriorityQueue; +import java.util.Set; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.Semaphore; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.AtomicReference; + +import static org.elasticsearch.index.engine.ThreadPoolMergeExecutorService.MAX_IO_RATE; +import static org.elasticsearch.index.engine.ThreadPoolMergeExecutorService.MIN_IO_RATE; +import static org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule.ABORT; +import static org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule.BACKLOG; +import static 
org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule.RUN; +import static org.hamcrest.Matchers.either; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.greaterThan; +import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.lessThan; +import static org.hamcrest.Matchers.lessThanOrEqualTo; +import static org.mockito.ArgumentMatchers.anyLong; +import static org.mockito.Mockito.atLeastOnce; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +public class ThreadPoolMergeExecutorServiceTests extends ESTestCase { + + public void testNewMergeTaskIsAbortedWhenThreadPoolIsShutdown() { + TestThreadPool testThreadPool = new TestThreadPool("test"); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService(testThreadPool); + // shutdown the thread pool + testThreadPool.shutdown(); + MergeTask mergeTask = mock(MergeTask.class); + when(mergeTask.supportsIOThrottling()).thenReturn(randomBoolean()); + assertFalse(threadPoolMergeExecutorService.submitMergeTask(mergeTask)); + verify(mergeTask).abort(); + verify(mergeTask, times(0)).schedule(); + verify(mergeTask, times(0)).run(); + verify(mergeTask, times(1)).abort(); + assertTrue(threadPoolMergeExecutorService.allDone()); + } + + public void testEnqueuedAndBackloggedMergesAreStillExecutedWhenThreadPoolIsShutdown() throws Exception { + int mergeExecutorThreadCount = randomIntBetween(1, 5); + // more merges than threads so that some are enqueued + int mergesToSubmit = mergeExecutorThreadCount + randomIntBetween(1, 5); + Settings settings = Settings.builder() + .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), true) + .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) + .build(); + TestThreadPool testThreadPool = 
new TestThreadPool("test", settings); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService(testThreadPool); + assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); + Semaphore runMergeSemaphore = new Semaphore(0); + ThreadPoolExecutor threadPoolExecutor = (ThreadPoolExecutor) testThreadPool.executor(ThreadPool.Names.MERGE); + AtomicInteger doneMergesCount = new AtomicInteger(0); + // submit more merge tasks than there are threads so that some are enqueued + for (int i = 0; i < mergesToSubmit; i++) { + MergeTask mergeTask = mock(MergeTask.class); + when(mergeTask.supportsIOThrottling()).thenReturn(randomBoolean()); + Schedule runOrAbort = randomFrom(RUN, ABORT); + doAnswer(mock -> { + // merges can be backlogged, but will be re-enqueued + Schedule schedule = randomFrom(BACKLOG, runOrAbort); + if (schedule == BACKLOG) { + // reenqueue backlogged merge task + new Thread(() -> threadPoolMergeExecutorService.reEnqueueBackloggedMergeTask(mergeTask)).start(); + } + return schedule; + }).when(mergeTask).schedule(); + doAnswer(mock -> { + // wait to be signalled before completing + if (runOrAbort == ABORT) { + fail("merge task ran but it should've aborted instead"); + } + runMergeSemaphore.acquireUninterruptibly(); + doneMergesCount.incrementAndGet(); + return null; + }).when(mergeTask).run(); + doAnswer(mock -> { + // wait to be signalled before completing + if (runOrAbort == RUN) { + fail("merge task aborted but it should've ran instead"); + } + runMergeSemaphore.acquireUninterruptibly(); + doneMergesCount.incrementAndGet(); + return null; + }).when(mergeTask).abort(); + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + } + // assert merges are running and enqueued + assertBusy(() -> { + // assert that there are merge tasks running concurrently at the max allowed concurrency rate + assertThat(threadPoolExecutor.getActiveCount(), is(mergeExecutorThreadCount)); + // 
with the other merge tasks enqueued + assertThat(threadPoolExecutor.getQueue().size(), is(mergesToSubmit - mergeExecutorThreadCount)); + }); + // shutdown prevents new merge tasks to be enqueued but existing ones should be allowed to continue + testThreadPool.shutdown(); + // assert all executors, except the merge one, are terminated + for (String executorName : ThreadPool.THREAD_POOL_TYPES.keySet()) { + assertTrue(testThreadPool.executor(executorName).isShutdown()); + if (ThreadPool.Names.MERGE.equals(executorName)) { + assertFalse(testThreadPool.executor(executorName).isTerminated()); + } else { + assertTrue(testThreadPool.executor(executorName).isTerminated()); + } + } + for (int i = 0; i < mergesToSubmit; i++) { + // closing the thread pool is delayed because there are running and/or enqueued merge tasks + assertFalse(testThreadPool.awaitTermination(1, TimeUnit.NANOSECONDS)); + assertTrue(threadPoolExecutor.isShutdown()); + assertFalse(threadPoolExecutor.isTerminated()); + // let merges run one by one and check thread pool + runMergeSemaphore.release(); + int completedMergesCount = i + 1; + assertBusy(() -> { + assertThat(doneMergesCount.get(), is(completedMergesCount)); + assertThat(threadPoolExecutor.getCompletedTaskCount(), is((long) completedMergesCount)); + // active threads still working on the remaining merges + assertThat( + threadPoolExecutor.getActiveCount(), + is(Math.min(mergeExecutorThreadCount, mergesToSubmit - completedMergesCount)) + ); + // with any of the other merges still enqueued + assertThat( + threadPoolExecutor.getQueue().size(), + is(Math.max(mergesToSubmit - mergeExecutorThreadCount - completedMergesCount, 0)) + ); + }); + } + assertBusy(() -> { + assertTrue(testThreadPool.awaitTermination(1, TimeUnit.NANOSECONDS)); + assertTrue(threadPoolExecutor.isShutdown()); + assertTrue(threadPoolExecutor.isTerminated()); + assertTrue(threadPoolMergeExecutorService.allDone()); + }); + } + + public void 
testTargetIORateChangesWhenSubmittingMergeTasks() throws Exception { + int mergeExecutorThreadCount = randomIntBetween(1, 5); + int mergesStillToSubmit = randomIntBetween(10, 100); + int mergesStillToComplete = mergesStillToSubmit; + Settings settings = Settings.builder() + .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), true) + .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) + .build(); + try (TestThreadPool testThreadPool = new TestThreadPool("test", settings)) { + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService(testThreadPool); + assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); + Semaphore runMergeSemaphore = new Semaphore(0); + AtomicInteger submittedIOThrottledMergeTasks = new AtomicInteger(); + while (mergesStillToComplete > 0) { + if (mergesStillToSubmit > 0 && (threadPoolMergeExecutorService.getRunningMergeTasks().isEmpty() || randomBoolean())) { + // submit new merge task + MergeTask mergeTask = mock(MergeTask.class); + boolean supportsIOThrottling = randomBoolean(); + when(mergeTask.supportsIOThrottling()).thenReturn(supportsIOThrottling); + doAnswer(mock -> { + Schedule schedule = randomFrom(Schedule.values()); + if (schedule == BACKLOG) { + testThreadPool.executor(ThreadPool.Names.GENERIC).execute(() -> { + // reenqueue backlogged merge task + threadPoolMergeExecutorService.reEnqueueBackloggedMergeTask(mergeTask); + }); + } + return schedule; + }).when(mergeTask).schedule(); + doAnswer(mock -> { + // wait to be signalled before completing + runMergeSemaphore.acquire(); + if (supportsIOThrottling) { + submittedIOThrottledMergeTasks.decrementAndGet(); + } + return null; + }).when(mergeTask).run(); + doAnswer(mock -> { + // wait to be signalled before completing + runMergeSemaphore.acquire(); + if (supportsIOThrottling) { + submittedIOThrottledMergeTasks.decrementAndGet(); + } + return null; + 
}).when(mergeTask).abort(); + long currentIORate = threadPoolMergeExecutorService.getTargetIORateBytesPerSec(); + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + if (supportsIOThrottling) { + submittedIOThrottledMergeTasks.incrementAndGet(); + } + long newIORate = threadPoolMergeExecutorService.getTargetIORateBytesPerSec(); + if (supportsIOThrottling) { + if (submittedIOThrottledMergeTasks.get() < threadPoolMergeExecutorService + .getConcurrentMergesFloorLimitForThrottling()) { + // assert the IO rate decreases, with a floor limit, when there are few merge tasks enqueued + assertThat(newIORate, either(is(MIN_IO_RATE.getBytes())).or(lessThan(currentIORate))); + } else if (submittedIOThrottledMergeTasks.get() > threadPoolMergeExecutorService + .getConcurrentMergesCeilLimitForThrottling()) { + // assert the IO rate increases, with a ceiling limit, when there are many merge tasks enqueued + assertThat(newIORate, either(is(MAX_IO_RATE.getBytes())).or(greaterThan(currentIORate))); + } else { + // assert the IO rate does NOT change when there are a couple of merge tasks enqueued + assertThat(newIORate, equalTo(currentIORate)); + } + } else { + // assert the IO rate does not change, when the merge task doesn't support IO throttling + assertThat(newIORate, equalTo(currentIORate)); + } + mergesStillToSubmit--; + } else { + ThreadPoolExecutor threadPoolExecutor = (ThreadPoolExecutor) testThreadPool.executor(ThreadPool.Names.MERGE); + long completedMerges = threadPoolExecutor.getCompletedTaskCount(); + runMergeSemaphore.release(); + // await merge to finish + assertBusy(() -> assertThat(threadPoolExecutor.getCompletedTaskCount(), is(completedMerges + 1))); + mergesStillToComplete--; + } + } + assertBusy(() -> assertTrue(threadPoolMergeExecutorService.allDone())); + } + } + + public void testIORateIsAdjustedForRunningMergeTasks() throws Exception { + int mergeExecutorThreadCount = randomIntBetween(1, 3); + int mergesStillToSubmit = randomIntBetween(1, 10); + int 
mergesStillToComplete = mergesStillToSubmit; + Settings settings = Settings.builder() + .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), true) + .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) + .build(); + try (TestThreadPool testThreadPool = new TestThreadPool("test", settings)) { + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService(testThreadPool); + assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); + ThreadPoolExecutor threadPoolExecutor = (ThreadPoolExecutor) testThreadPool.executor(ThreadPool.Names.MERGE); + Semaphore runMergeSemaphore = new Semaphore(0); + Set currentlyRunningMergeTasksSet = ConcurrentCollections.newConcurrentSet(); + while (mergesStillToComplete > 0) { + if (mergesStillToSubmit > 0 && (currentlyRunningMergeTasksSet.isEmpty() || randomBoolean())) { + MergeTask mergeTask = mock(MergeTask.class); + // all tasks support IO throttling in this test case + when(mergeTask.supportsIOThrottling()).thenReturn(true); + doAnswer(mock -> { + Schedule schedule = randomFrom(Schedule.values()); + if (schedule == BACKLOG) { + testThreadPool.executor(ThreadPool.Names.GENERIC).execute(() -> { + // reenqueue backlogged merge task + threadPoolMergeExecutorService.reEnqueueBackloggedMergeTask(mergeTask); + }); + } + return schedule; + }).when(mergeTask).schedule(); + doAnswer(mock -> { + currentlyRunningMergeTasksSet.add(mergeTask); + // wait to be signalled before completing + runMergeSemaphore.acquire(); + currentlyRunningMergeTasksSet.remove(mergeTask); + return null; + }).when(mergeTask).run(); + doAnswer(mock -> { + // wait to be signalled before completing + runMergeSemaphore.acquire(); + return null; + }).when(mergeTask).abort(); + int activeMergeTasksCount = threadPoolExecutor.getActiveCount(); + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + long newIORate = 
threadPoolMergeExecutorService.getTargetIORateBytesPerSec(); + // all currently running merge tasks must be IO throttled + assertBusy(() -> { + // await new merge to start executing + if (activeMergeTasksCount < mergeExecutorThreadCount) { + assertThat(threadPoolExecutor.getActiveCount(), is(activeMergeTasksCount + 1)); + } + // assert IO throttle is set on the running merge tasks + for (MergeTask currentlyRunningMergeTask : currentlyRunningMergeTasksSet) { + var ioRateCaptor = ArgumentCaptor.forClass(Long.class); + // only interested in the last invocation + verify(currentlyRunningMergeTask, atLeastOnce()).setIORateLimit(ioRateCaptor.capture()); + assertThat(ioRateCaptor.getValue(), is(newIORate)); + } + }); + mergesStillToSubmit--; + } else { + long completedMerges = threadPoolExecutor.getCompletedTaskCount(); + runMergeSemaphore.release(); + // await merge to finish + assertBusy(() -> assertThat(threadPoolExecutor.getCompletedTaskCount(), is(completedMerges + 1))); + mergesStillToComplete--; + } + } + assertBusy(() -> assertTrue(threadPoolMergeExecutorService.allDone())); + } + } + + public void testIORateAdjustedForSubmittedTasksWhenExecutionRateIsSpeedy() { + // the executor runs merge tasks at a faster rate than the rate that merge tasks are submitted + int submittedVsExecutedRateOutOf1000 = randomIntBetween(0, 250); + testIORateAdjustedForSubmittedTasks(randomIntBetween(50, 1000), submittedVsExecutedRateOutOf1000, randomIntBetween(0, 5)); + // executor starts running merges only after a considerable amount of merge tasks have already been submitted + testIORateAdjustedForSubmittedTasks(randomIntBetween(50, 1000), submittedVsExecutedRateOutOf1000, randomIntBetween(5, 50)); + } + + public void testIORateAdjustedForSubmittedTasksWhenExecutionRateIsSluggish() { + // the executor runs merge tasks at a slower rate than the rate that merge tasks are submitted + int submittedVsExecutedRateOutOf1000 = randomIntBetween(750, 1000); +
testIORateAdjustedForSubmittedTasks(randomIntBetween(50, 1000), submittedVsExecutedRateOutOf1000, randomIntBetween(0, 5)); + // executor starts running merges only after a considerable amount of merge tasks have already been submitted + testIORateAdjustedForSubmittedTasks(randomIntBetween(50, 1000), submittedVsExecutedRateOutOf1000, randomIntBetween(5, 50)); + } + + public void testIORateAdjustedForSubmittedTasksWhenExecutionRateIsOnPar() { + // the executor runs merge tasks at a rate comparable to the rate that merge tasks are submitted + int submittedVsExecutedRateOutOf1000 = randomIntBetween(250, 750); + testIORateAdjustedForSubmittedTasks(randomIntBetween(50, 1000), submittedVsExecutedRateOutOf1000, randomIntBetween(0, 5)); + // executor starts running merges only after a considerable amount of merge tasks have already been submitted + testIORateAdjustedForSubmittedTasks(randomIntBetween(50, 1000), submittedVsExecutedRateOutOf1000, randomIntBetween(5, 50)); + } + + private void testIORateAdjustedForSubmittedTasks( + int totalTasksToSubmit, + int submittedVsExecutedRateOutOf1000, + int initialTasksToSubmit + ) { + DeterministicTaskQueue mergeExecutorTaskQueue = new DeterministicTaskQueue(); + ThreadPool mergeExecutorThreadPool = mergeExecutorTaskQueue.getThreadPool(); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService(mergeExecutorThreadPool); + final AtomicInteger currentlySubmittedMergeTaskCount = new AtomicInteger(); + final AtomicLong targetIORateLimit = new AtomicLong(ThreadPoolMergeExecutorService.START_IO_RATE.getBytes()); + final AtomicReference lastRunTask = new AtomicReference<>(); + int initialTasksCounter = Math.min(initialTasksToSubmit, totalTasksToSubmit); + while (totalTasksToSubmit > 0 || mergeExecutorTaskQueue.hasAnyTasks()) { + if (mergeExecutorTaskQueue.hasAnyTasks() == false // always submit if there are no outstanding merge tasks + || initialTasksCounter > 0 // first submit all the initial tasks
+ || (randomIntBetween(0, 1000) < submittedVsExecutedRateOutOf1000 && totalTasksToSubmit > 0)) { + // submit new merge task + MergeTask mergeTask = mock(MergeTask.class); + // all merge tasks support IO throttling in this test + when(mergeTask.supportsIOThrottling()).thenReturn(true); + // always run the task + when(mergeTask.schedule()).thenReturn(RUN); + doAnswer(mock -> { + lastRunTask.set(mergeTask); + return null; + }).when(mergeTask).run(); + currentlySubmittedMergeTaskCount.incrementAndGet(); + totalTasksToSubmit--; + initialTasksCounter--; + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + long newTargetIORateLimit = threadPoolMergeExecutorService.getTargetIORateBytesPerSec(); + if (currentlySubmittedMergeTaskCount.get() < threadPoolMergeExecutorService.getConcurrentMergesFloorLimitForThrottling()) { + // assert the IO rate decreases, with a floor limit, when there are few merge tasks enqueued + assertThat(newTargetIORateLimit, either(is(MIN_IO_RATE.getBytes())).or(lessThan(targetIORateLimit.get()))); + } else if (currentlySubmittedMergeTaskCount.get() > threadPoolMergeExecutorService + .getConcurrentMergesCeilLimitForThrottling()) { + // assert the IO rate increases, with a ceiling limit, when there are many merge tasks enqueued + assertThat(newTargetIORateLimit, either(is(MAX_IO_RATE.getBytes())).or(greaterThan(targetIORateLimit.get()))); + } else { + // assert the IO rate does NOT change, when there are a couple of merge tasks enqueued + assertThat(newTargetIORateLimit, equalTo(targetIORateLimit.get())); + } + targetIORateLimit.set(newTargetIORateLimit); + } else { + // execute already submitted merge task + if (runOneTask(mergeExecutorTaskQueue)) { + // task is done, no longer just submitted + currentlySubmittedMergeTaskCount.decrementAndGet(); + // assert IO rate is invoked on the merge task that just ran + assertNotNull(lastRunTask.get()); + var ioRateCaptor = ArgumentCaptor.forClass(Long.class); + verify(lastRunTask.get(),
times(1)).setIORateLimit(ioRateCaptor.capture()); + assertThat(ioRateCaptor.getValue(), is(targetIORateLimit.get())); + lastRunTask.set(null); + } + } + } + assertTrue(threadPoolMergeExecutorService.allDone()); + } + + public void testMergeTasksRunConcurrently() throws Exception { + // at least 2 merges allowed to run concurrently + int mergeExecutorThreadCount = randomIntBetween(2, 5); + Settings settings = Settings.builder() + .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), true) + .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) + .build(); + try (TestThreadPool testThreadPool = new TestThreadPool("test", settings)) { + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService(testThreadPool); + assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); + // more merge tasks than max concurrent merges allowed to run concurrently + int totalMergeTasksCount = mergeExecutorThreadCount + randomIntBetween(1, 5); + Semaphore runMergeSemaphore = new Semaphore(0); + ThreadPoolExecutor threadPoolExecutor = (ThreadPoolExecutor) testThreadPool.executor(ThreadPool.Names.MERGE); + // submit all merge tasks + for (int i = 0; i < totalMergeTasksCount; i++) { + MergeTask mergeTask = mock(MergeTask.class); + when(mergeTask.supportsIOThrottling()).thenReturn(randomBoolean()); + doAnswer(mock -> { + // each individual merge task can either "run" or be "backlogged" + Schedule schedule = randomFrom(RUN, BACKLOG); + if (schedule == BACKLOG) { + testThreadPool.executor(ThreadPool.Names.GENERIC).execute(() -> { + // reenqueue backlogged merge task + threadPoolMergeExecutorService.reEnqueueBackloggedMergeTask(mergeTask); + }); + } + return schedule; + }).when(mergeTask).schedule(); + doAnswer(mock -> { + // wait to be signalled before completing + runMergeSemaphore.acquire(); + return null; + }).when(mergeTask).run(); + doAnswer(mock -> { + 
fail("This test doesn't deal with aborted merge tasks"); + return null; + }).when(mergeTask).abort(); + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + } + // assert stats while merge tasks finish + for (int completedTasksCount = 0; completedTasksCount < totalMergeTasksCount + - mergeExecutorThreadCount; completedTasksCount++) { + int finalCompletedTasksCount = completedTasksCount; + assertBusy(() -> { + // assert that there are merge tasks running concurrently at the max allowed concurrency rate + assertThat(threadPoolMergeExecutorService.getRunningMergeTasks().size(), is(mergeExecutorThreadCount)); + // with the other merge tasks enqueued + assertThat( + threadPoolMergeExecutorService.getQueuedMergeTasks().size(), + is(totalMergeTasksCount - mergeExecutorThreadCount - finalCompletedTasksCount) + ); + // also check thread-pool stats for the same + assertThat(threadPoolExecutor.getActiveCount(), is(mergeExecutorThreadCount)); + assertThat( + threadPoolExecutor.getQueue().size(), + is(totalMergeTasksCount - mergeExecutorThreadCount - finalCompletedTasksCount) + ); + }); + // let one merge task finish running + runMergeSemaphore.release(); + } + // there are now fewer merge tasks still running than available threads + for (int remainingMergeTasksCount = mergeExecutorThreadCount; remainingMergeTasksCount >= 0; remainingMergeTasksCount--) { + int finalRemainingMergeTasksCount = remainingMergeTasksCount; + assertBusy(() -> { + // there are fewer available merges than available threads + assertThat(threadPoolMergeExecutorService.getRunningMergeTasks().size(), is(finalRemainingMergeTasksCount)); + // no more merges enqueued + assertThat(threadPoolMergeExecutorService.getQueuedMergeTasks().size(), is(0)); + // also check thread-pool stats for the same + assertThat(threadPoolExecutor.getActiveCount(), is(finalRemainingMergeTasksCount)); + assertThat(threadPoolExecutor.getQueue().size(), is(0)); + }); + // let one merge task finish running + 
runMergeSemaphore.release(); + } + assertBusy(() -> assertTrue(threadPoolMergeExecutorService.allDone())); + } + } + + public void testThreadPoolStatsWithBackloggedMergeTasks() throws Exception { + int mergeExecutorThreadCount = randomIntBetween(1, 3); + Settings settings = Settings.builder() + .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), true) + .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) + .build(); + try (TestThreadPool testThreadPool = new TestThreadPool("test", settings)) { + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService(testThreadPool); + assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); + int totalMergeTasksCount = randomIntBetween(1, 10); + ThreadPoolExecutor threadPoolExecutor = (ThreadPoolExecutor) testThreadPool.executor(ThreadPool.Names.MERGE); + List backloggedMergeTasksList = new ArrayList<>(); + for (int i = 0; i < totalMergeTasksCount; i++) { + MergeTask mergeTask = mock(MergeTask.class); + when(mergeTask.supportsIOThrottling()).thenReturn(randomBoolean()); + boolean runNowOrBacklog = randomBoolean(); + if (runNowOrBacklog) { + when(mergeTask.schedule()).thenReturn(randomFrom(RUN, ABORT)); + } else { + // first backlog, then run + when(mergeTask.schedule()).thenReturn(BACKLOG, randomFrom(RUN, ABORT)); + backloggedMergeTasksList.add(mergeTask); + } + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + } + assertBusy(() -> { + // all runnable merge tasks should show as "completed" + assertThat(threadPoolExecutor.getCompletedTaskCount(), is((long) (totalMergeTasksCount - backloggedMergeTasksList.size()))); + if (backloggedMergeTasksList.size() >= mergeExecutorThreadCount) { + // active tasks waiting for backlogged merge tasks to be re-enqueued + assertThat(threadPoolExecutor.getActiveCount(), is(mergeExecutorThreadCount)); + assertThat(threadPoolExecutor.getQueue().size(), 
is(backloggedMergeTasksList.size() - mergeExecutorThreadCount)); + } else { + assertThat(threadPoolExecutor.getActiveCount(), is(backloggedMergeTasksList.size())); + assertThat(threadPoolExecutor.getQueue().size(), is(0)); + } + assertThat(threadPoolMergeExecutorService.getQueuedMergeTasks().size(), is(0)); + }); + // re-enqueue backlogged merge tasks + for (MergeTask backloggedMergeTask : backloggedMergeTasksList) { + threadPoolMergeExecutorService.reEnqueueBackloggedMergeTask(backloggedMergeTask); + } + assertBusy(() -> { + // all merge tasks should now show as "completed" + assertThat(threadPoolExecutor.getCompletedTaskCount(), is((long) totalMergeTasksCount)); + assertThat(threadPoolExecutor.getActiveCount(), is(0)); + assertThat(threadPoolExecutor.getQueue().size(), is(0)); + assertTrue(threadPoolMergeExecutorService.allDone()); + }); + } + } + + public void testBackloggedMergeTasksExecuteExactlyOnce() throws Exception { + int mergeExecutorThreadCount = randomIntBetween(1, 3); + Settings settings = Settings.builder() + .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), true) + // few merge threads, in order to increase contention + .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) + .build(); + try (TestThreadPool testThreadPool = new TestThreadPool("test", settings)) { + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService(testThreadPool); + assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); + // many merge tasks concurrently + int mergeTaskCount = randomIntBetween(10, 100); + CountDownLatch mergeTasksReadyLatch = new CountDownLatch(mergeTaskCount); + CountDownLatch submitTaskLatch = new CountDownLatch(1); + Collection runMergeTasks = ConcurrentCollections.newConcurrentSet(); + Collection abortMergeTasks = ConcurrentCollections.newConcurrentSet(); + for (int i = 0; i < mergeTaskCount; i++) { + new Thread(() 
-> { + MergeTask mergeTask = mock(MergeTask.class); + when(mergeTask.supportsIOThrottling()).thenReturn(randomBoolean()); + doAnswer(mock -> { + // each individual merge task can either "run", be "aborted", or be "backlogged" + Schedule schedule = randomFrom(RUN, ABORT, BACKLOG); + if (schedule == BACKLOG) { + testThreadPool.executor(ThreadPool.Names.GENERIC).execute(() -> { + // reenqueue backlogged merge task + threadPoolMergeExecutorService.reEnqueueBackloggedMergeTask(mergeTask); + }); + } + if (schedule == RUN) { + runMergeTasks.add(mergeTask); + } + if (schedule == ABORT) { + abortMergeTasks.add(mergeTask); + } + return schedule; + }).when(mergeTask).schedule(); + mergeTasksReadyLatch.countDown(); + // make all threads submit merge tasks at once + safeAwait(submitTaskLatch); + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + }).start(); + } + safeAwait(mergeTasksReadyLatch); + submitTaskLatch.countDown(); + assertBusy(() -> { + assertThat(runMergeTasks.size() + abortMergeTasks.size(), is(mergeTaskCount)); + for (MergeTask mergeTask : runMergeTasks) { + verify(mergeTask, times(1)).run(); + verify(mergeTask, times(0)).abort(); + if (mergeTask.supportsIOThrottling() == false) { + verify(mergeTask, times(0)).setIORateLimit(anyLong()); + } + } + for (MergeTask mergeTask : abortMergeTasks) { + verify(mergeTask, times(0)).run(); + verify(mergeTask, times(1)).abort(); + verify(mergeTask, times(0)).setIORateLimit(anyLong()); + } + assertTrue(threadPoolMergeExecutorService.allDone()); + }); + } + } + + public void testMergeTasksExecuteInSizeOrder() { + DeterministicTaskQueue mergeExecutorTaskQueue = new DeterministicTaskQueue(); + ThreadPool mergeExecutorThreadPool = mergeExecutorTaskQueue.getThreadPool(); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService(mergeExecutorThreadPool); + DeterministicTaskQueue reEnqueueBackloggedTaskQueue = new DeterministicTaskQueue(); + int mergeTaskCount = randomIntBetween(10, 100); + //
sort merge tasks available to run by size + PriorityQueue mergeTasksAvailableToRun = new PriorityQueue<>( + mergeTaskCount, + Comparator.comparingLong(MergeTask::estimatedMergeSize) + ); + for (int i = 0; i < mergeTaskCount; i++) { + MergeTask mergeTask = mock(MergeTask.class); + when(mergeTask.supportsIOThrottling()).thenReturn(randomBoolean()); + // merge tasks of various sizes (0 might be a valid value) + when(mergeTask.estimatedMergeSize()).thenReturn(randomLongBetween(0, 10)); + doAnswer(mock -> { + // each individual merge task can either "run" or be "backlogged" at any point in time + Schedule schedule = randomFrom(Schedule.values()); + // in either case, the merge task is, at least temporarily, not "available" to run + mergeTasksAvailableToRun.remove(mergeTask); + // if merge task cannot run, it is backlogged, and should be re enqueued some time in the future + if (schedule == BACKLOG) { + // reenqueue backlogged merge task sometime in the future + reEnqueueBackloggedTaskQueue.scheduleNow(() -> { + // reenqueue backlogged merge task sometime in the future + threadPoolMergeExecutorService.reEnqueueBackloggedMergeTask(mergeTask); + // the merge task should once again be "available" to run + mergeTasksAvailableToRun.add(mergeTask); + }); + } + // hack: avoid blocking for unavailable merge task by running one re-enqueuing task now + if (schedule == BACKLOG && mergeTasksAvailableToRun.isEmpty()) { + assertTrue(runOneTask(reEnqueueBackloggedTaskQueue)); + } + if (schedule == RUN && mergeTasksAvailableToRun.isEmpty() == false) { + // assert the merge task that's now going to run is the smallest of the ones currently available to run + assertThat(mergeTask.estimatedMergeSize(), lessThanOrEqualTo(mergeTasksAvailableToRun.peek().estimatedMergeSize())); + } + return schedule; + }).when(mergeTask).schedule(); + mergeTasksAvailableToRun.add(mergeTask); + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + } + while (true) { + // re-enqueue merge tasks + if 
(mergeTasksAvailableToRun.isEmpty() || randomBoolean()) { + boolean backlogReEnqueued = runOneTask(reEnqueueBackloggedTaskQueue); + if (mergeTasksAvailableToRun.isEmpty() && backlogReEnqueued == false) { + // test complete, all merges ran, and none is backlogged + assertFalse(mergeExecutorTaskQueue.hasAnyTasks()); + assertFalse(reEnqueueBackloggedTaskQueue.hasAnyTasks()); + assertTrue(threadPoolMergeExecutorService.allDone()); + break; + } + } else { + // run one merge task + runOneTask(mergeExecutorTaskQueue); + } + } + } + + static ThreadPoolMergeExecutorService getThreadPoolMergeExecutorService(ThreadPool threadPool) { + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = ThreadPoolMergeExecutorService + .maybeCreateThreadPoolMergeExecutorService( + threadPool, + randomBoolean() + ? Settings.EMPTY + : Settings.builder().put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), true).build() + ); + assertNotNull(threadPoolMergeExecutorService); + assertTrue(threadPoolMergeExecutorService.allDone()); + return threadPoolMergeExecutorService; + } + + private static boolean runOneTask(DeterministicTaskQueue deterministicTaskQueue) { + while (deterministicTaskQueue.hasAnyTasks()) { + if (deterministicTaskQueue.hasRunnableTasks()) { + deterministicTaskQueue.runRandomTask(); + return true; + } else { + deterministicTaskQueue.advanceTime(); + } + } + return false; + } +} diff --git a/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java new file mode 100644 index 0000000000000..5e085c083b785 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java @@ -0,0 +1,496 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.engine; + +import org.apache.lucene.index.MergePolicy; +import org.apache.lucene.index.MergePolicy.OneMerge; +import org.apache.lucene.index.MergeScheduler.MergeSource; +import org.apache.lucene.index.MergeTrigger; +import org.apache.lucene.store.MergeInfo; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.util.concurrent.DeterministicTaskQueue; +import org.elasticsearch.common.util.concurrent.EsExecutors; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.MergeSchedulerConfig; +import org.elasticsearch.index.engine.ThreadPoolMergeScheduler.MergeTask; +import org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule; +import org.elasticsearch.index.shard.ShardId; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.test.IndexSettingsModule; +import org.elasticsearch.threadpool.TestThreadPool; +import org.elasticsearch.threadpool.ThreadPool; +import org.mockito.ArgumentCaptor; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.PriorityQueue; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.Semaphore; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; + +import static org.hamcrest.Matchers.contains; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.lessThanOrEqualTo; +import static org.mockito.ArgumentMatchers.any; +import static 
org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.verifyNoInteractions; +import static org.mockito.Mockito.when; + +public class ThreadPoolMergeSchedulerTests extends ESTestCase { + + public void testMergesExecuteInSizeOrder() throws IOException { + DeterministicTaskQueue threadPoolTaskQueue = new DeterministicTaskQueue(); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = ThreadPoolMergeExecutorServiceTests + .getThreadPoolMergeExecutorService(threadPoolTaskQueue.getThreadPool()); + try ( + ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( + new ShardId("index", "_na_", 1), + IndexSettingsModule.newIndexSettings("index", Settings.EMPTY), + threadPoolMergeExecutorService + ) + ) { + List executedMergesList = new ArrayList<>(); + int mergeCount = randomIntBetween(2, 10); + for (int i = 0; i < mergeCount; i++) { + MergeSource mergeSource = mock(MergeSource.class); + OneMerge oneMerge = mock(OneMerge.class); + when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomLongBetween(1L, 10L))); + when(oneMerge.getMergeProgress()).thenReturn(new MergePolicy.OneMergeProgress()); + when(mergeSource.getNextMerge()).thenReturn(oneMerge, (OneMerge) null); + doAnswer(invocation -> { + OneMerge merge = (OneMerge) invocation.getArguments()[0]; + assertFalse(merge.isAborted()); + executedMergesList.add(merge); + return null; + }).when(mergeSource).merge(any(OneMerge.class)); + threadPoolMergeScheduler.merge(mergeSource, randomFrom(MergeTrigger.values())); + } + threadPoolTaskQueue.runAllTasks(); + assertThat(executedMergesList.size(), is(mergeCount)); + // assert merges are executed in ascending size order + for (int i = 1; i < mergeCount; i++) { + assertThat( + executedMergesList.get(i - 1).getStoreMergeInfo().estimatedMergeBytes(), + 
lessThanOrEqualTo(executedMergesList.get(i).getStoreMergeInfo().estimatedMergeBytes()) + ); + } + } + assertTrue(threadPoolMergeExecutorService.allDone()); + } + + public void testSimpleMergeTaskBacklogging() { + int mergeExecutorThreadCount = randomIntBetween(1, 5); + Settings mergeSchedulerSettings = Settings.builder() + .put(MergeSchedulerConfig.MAX_THREAD_COUNT_SETTING.getKey(), mergeExecutorThreadCount) + .build(); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = mock(ThreadPoolMergeExecutorService.class); + // close method waits for running merges to finish, but this test leaves running merges around + ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( + new ShardId("index", "_na_", 1), + IndexSettingsModule.newIndexSettings("index", mergeSchedulerSettings), + threadPoolMergeExecutorService + ); + // more merge tasks than merge threads + int mergeCount = mergeExecutorThreadCount + randomIntBetween(1, 5); + for (int i = 0; i < mergeCount; i++) { + MergeSource mergeSource = mock(MergeSource.class); + OneMerge oneMerge = mock(OneMerge.class); + when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomLongBetween(1L, 10L))); + when(oneMerge.getMergeProgress()).thenReturn(new MergePolicy.OneMergeProgress()); + when(mergeSource.getNextMerge()).thenReturn(oneMerge, (OneMerge) null); + Schedule schedule = threadPoolMergeScheduler.schedule( + threadPoolMergeScheduler.newMergeTask(mergeSource, oneMerge, randomFrom(MergeTrigger.values())) + ); + if (i < mergeExecutorThreadCount) { + assertThat(schedule, is(Schedule.RUN)); + } else { + assertThat(schedule, is(Schedule.BACKLOG)); + } + } + assertThat(threadPoolMergeScheduler.getRunningMergeTasks().size(), is(mergeExecutorThreadCount)); + assertThat(threadPoolMergeScheduler.getBackloggedMergeTasks().size(), is(mergeCount - mergeExecutorThreadCount)); + } + + public void testSimpleMergeTaskReEnqueueingBySize() { + int mergeExecutorThreadCount = randomIntBetween(1, 
5); + Settings mergeSchedulerSettings = Settings.builder() + .put(MergeSchedulerConfig.MAX_THREAD_COUNT_SETTING.getKey(), mergeExecutorThreadCount) + .build(); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = mock(ThreadPoolMergeExecutorService.class); + // close method waits for running merges to finish, but this test leaves running merges around + ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( + new ShardId("index", "_na_", 1), + IndexSettingsModule.newIndexSettings("index", mergeSchedulerSettings), + threadPoolMergeExecutorService + ); + // sort backlogged merges by size + PriorityQueue backloggedMergeTasks = new PriorityQueue<>(16, Comparator.comparingLong(MergeTask::estimatedMergeSize)); + // more merge tasks than merge threads + int mergeCount = mergeExecutorThreadCount + randomIntBetween(2, 10); + for (int i = 0; i < mergeCount; i++) { + MergeSource mergeSource = mock(MergeSource.class); + OneMerge oneMerge = mock(OneMerge.class); + when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomLongBetween(1L, 10L))); + when(oneMerge.getMergeProgress()).thenReturn(new MergePolicy.OneMergeProgress()); + when(mergeSource.getNextMerge()).thenReturn(oneMerge, (OneMerge) null); + MergeTask mergeTask = threadPoolMergeScheduler.newMergeTask(mergeSource, oneMerge, randomFrom(MergeTrigger.values())); + Schedule schedule = threadPoolMergeScheduler.schedule(mergeTask); + if (i < mergeExecutorThreadCount) { + assertThat(schedule, is(Schedule.RUN)); + } else { + assertThat(schedule, is(Schedule.BACKLOG)); + backloggedMergeTasks.add(mergeTask); + } + } + assertThat(threadPoolMergeScheduler.getRunningMergeTasks().size(), is(mergeExecutorThreadCount)); + assertThat(threadPoolMergeScheduler.getBackloggedMergeTasks().size(), is(backloggedMergeTasks.size())); + int enqueuedTasksCount = mergeCount - mergeExecutorThreadCount; + for (int i = 0; i < enqueuedTasksCount; i++) { + 
assertThat(threadPoolMergeScheduler.getBackloggedMergeTasks().size(), is(enqueuedTasksCount - i)); + MergeTask runningMergeTask = randomFrom(threadPoolMergeScheduler.getRunningMergeTasks().values()); + runningMergeTask.run(); + var submittedMergeTaskCaptor = ArgumentCaptor.forClass(MergeTask.class); + verify(threadPoolMergeExecutorService, times(i + 1)).reEnqueueBackloggedMergeTask(submittedMergeTaskCaptor.capture()); + assertThat(submittedMergeTaskCaptor.getValue(), is(backloggedMergeTasks.poll())); + Schedule schedule = threadPoolMergeScheduler.schedule(submittedMergeTaskCaptor.getValue()); + assertThat(schedule, is(Schedule.RUN)); + assertThat(threadPoolMergeScheduler.getRunningMergeTasks().size(), is(mergeExecutorThreadCount)); + } + } + + public void testMergeSourceWithFollowUpMergesRunSequentially() throws Exception { + // test with min 2 allowed concurrent merges + int mergeExecutorThreadCount = randomIntBetween(2, 5); + Settings settings = Settings.builder() + .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) + .put(MergeSchedulerConfig.MAX_THREAD_COUNT_SETTING.getKey(), mergeExecutorThreadCount) + .build(); + try (TestThreadPool testThreadPool = new TestThreadPool("test", settings)) { + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = ThreadPoolMergeExecutorServiceTests + .getThreadPoolMergeExecutorService(testThreadPool); + assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); + try ( + ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( + new ShardId("index", "_na_", 1), + IndexSettingsModule.newIndexSettings("index", settings), + threadPoolMergeExecutorService + ) + ) { + MergeSource mergeSource = mock(MergeSource.class); + OneMerge firstMerge = mock(OneMerge.class); + when(firstMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomLongBetween(1L, 10L))); + when(firstMerge.getMergeProgress()).thenReturn(new 
MergePolicy.OneMergeProgress()); + // at least one followup merge + null (i.e. no more followups) + int followUpMergeCount = randomIntBetween(2, 10); + OneMerge[] followUpMerges = new OneMerge[followUpMergeCount]; + followUpMerges[followUpMergeCount - 1] = null; + for (int i = 0; i < followUpMergeCount - 1; i++) { + OneMerge oneMerge = mock(OneMerge.class); + when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomLongBetween(1L, 10L))); + when(oneMerge.getMergeProgress()).thenReturn(new MergePolicy.OneMergeProgress()); + followUpMerges[i] = oneMerge; + } + // the merge source with follow-up merges + when(mergeSource.getNextMerge()).thenReturn(firstMerge, followUpMerges); + AtomicBoolean isMergeInProgress = new AtomicBoolean(); + AtomicInteger runMergeIdx = new AtomicInteger(); + Semaphore runMergeSemaphore = new Semaphore(0); + Semaphore nextMergeSemaphore = new Semaphore(0); + doAnswer(invocation -> { + // assert only one merge can be in-progress at any point-in-time + assertTrue(isMergeInProgress.compareAndSet(false, true)); + OneMerge mergeInvocation = (OneMerge) invocation.getArguments()[0]; + assertFalse(mergeInvocation.isAborted()); + // assert merges run in the order they are produced by the merge source + if (runMergeIdx.get() == 0) { + assertThat(mergeInvocation, is(firstMerge)); + } else { + assertThat(mergeInvocation, is(followUpMerges[runMergeIdx.get() - 1])); + } + runMergeIdx.incrementAndGet(); + // await before returning from the merge in order to really ensure that follow-up merges don't run concurrently + nextMergeSemaphore.release(); + runMergeSemaphore.acquire(); + assertTrue(isMergeInProgress.compareAndSet(true, false)); + return null; + }).when(mergeSource).merge(any(OneMerge.class)); + // trigger run merges on the merge source + threadPoolMergeScheduler.merge(mergeSource, randomFrom(MergeTrigger.values())); + do { + // let merges run, but wait for the in-progress one to signal it is running + nextMergeSemaphore.acquire(); + 
runMergeSemaphore.release(); + } while (runMergeIdx.get() < followUpMergeCount); + assertBusy(() -> assertTrue(threadPoolMergeExecutorService.allDone())); + } + } + } + + public void testMergesRunConcurrently() throws Exception { + // min 2 allowed concurrent merges, per scheduler + int mergeSchedulerMaxThreadCount = randomIntBetween(2, 4); + // the merge executor has at least 1 extra thread available + int mergeExecutorThreadCount = mergeSchedulerMaxThreadCount + randomIntBetween(1, 3); + Settings settings = Settings.builder() + .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) + .put(MergeSchedulerConfig.MAX_THREAD_COUNT_SETTING.getKey(), mergeSchedulerMaxThreadCount) + .build(); + try (TestThreadPool testThreadPool = new TestThreadPool("test", settings)) { + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = ThreadPoolMergeExecutorServiceTests + .getThreadPoolMergeExecutorService(testThreadPool); + assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); + ThreadPoolExecutor threadPoolExecutor = (ThreadPoolExecutor) testThreadPool.executor(ThreadPool.Names.MERGE); + try ( + ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( + new ShardId("index", "_na_", 1), + IndexSettingsModule.newIndexSettings("index", settings), + threadPoolMergeExecutorService + ) + ) { + // at least 1 extra merge than there are concurrently allowed + int mergeCount = mergeExecutorThreadCount + randomIntBetween(1, 10); + Semaphore runMergeSemaphore = new Semaphore(0); + for (int i = 0; i < mergeCount; i++) { + MergeSource mergeSource = mock(MergeSource.class); + OneMerge oneMerge = mock(OneMerge.class); + when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomLongBetween(1L, 10L))); + when(oneMerge.getMergeProgress()).thenReturn(new MergePolicy.OneMergeProgress()); + when(mergeSource.getNextMerge()).thenReturn(oneMerge, (OneMerge) null); + doAnswer(invocation -> 
{ + OneMerge merge = (OneMerge) invocation.getArguments()[0]; + assertFalse(merge.isAborted()); + // wait to be signalled before completing + runMergeSemaphore.acquire(); + return null; + }).when(mergeSource).merge(any(OneMerge.class)); + threadPoolMergeScheduler.merge(mergeSource, randomFrom(MergeTrigger.values())); + } + for (int completedMergesCount = 0; completedMergesCount < mergeCount + - mergeSchedulerMaxThreadCount; completedMergesCount++) { + int finalCompletedMergesCount = completedMergesCount; + assertBusy(() -> { + // assert that there are merges running concurrently at the max allowed concurrency rate + assertThat(threadPoolMergeScheduler.getRunningMergeTasks().size(), is(mergeSchedulerMaxThreadCount)); + // with the other merges backlogged + assertThat( + threadPoolMergeScheduler.getBackloggedMergeTasks().size(), + is(mergeCount - mergeSchedulerMaxThreadCount - finalCompletedMergesCount) + ); + // also check the same for the thread-pool executor + assertThat(threadPoolMergeExecutorService.getRunningMergeTasks().size(), is(mergeSchedulerMaxThreadCount)); + // queued merge tasks do not include backlogged merges + assertThat(threadPoolMergeExecutorService.getQueuedMergeTasks().size(), is(0)); + // also check thread-pool stats for the same + // there are active thread-pool threads waiting for the backlogged merge tasks to be re-enqueued + int activeMergeThreads = Math.min(mergeCount - finalCompletedMergesCount, mergeExecutorThreadCount); + assertThat(threadPoolExecutor.getActiveCount(), is(activeMergeThreads)); + assertThat(threadPoolExecutor.getQueue().size(), is(mergeCount - finalCompletedMergesCount - activeMergeThreads)); + }); + // let one merge task finish running + runMergeSemaphore.release(); + } + // there are now fewer merges still running than available threads + for (int remainingMergesCount = mergeSchedulerMaxThreadCount; remainingMergesCount >= 0; remainingMergesCount--) { + int finalRemainingMergesCount = remainingMergesCount; + 
assertBusy(() -> { + // there are fewer available merges than available threads + assertThat(threadPoolMergeScheduler.getRunningMergeTasks().size(), is(finalRemainingMergesCount)); + // no more backlogged merges + assertThat(threadPoolMergeScheduler.getBackloggedMergeTasks().size(), is(0)); + // also check thread-pool executor for the same + assertThat(threadPoolMergeExecutorService.getRunningMergeTasks().size(), is(finalRemainingMergesCount)); + // no more backlogged merges + assertThat(threadPoolMergeExecutorService.getQueuedMergeTasks().size(), is(0)); + // also check thread-pool stats for the same + assertThat(threadPoolExecutor.getActiveCount(), is(finalRemainingMergesCount)); + assertThat(threadPoolExecutor.getQueue().size(), is(0)); + }); + // let one merge task finish running + runMergeSemaphore.release(); + } + assertBusy(() -> assertTrue(threadPoolMergeExecutorService.allDone())); + } + } + } + + public void testSchedulerCloseWaitsForRunningMerge() throws Exception { + int mergeSchedulerMaxThreadCount = randomIntBetween(1, 3); + int mergeExecutorThreadCount = randomIntBetween(1, 3); + Settings settings = Settings.builder() + .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) + .put(MergeSchedulerConfig.MAX_THREAD_COUNT_SETTING.getKey(), mergeSchedulerMaxThreadCount) + .build(); + try (TestThreadPool testThreadPool = new TestThreadPool("test", settings)) { + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = ThreadPoolMergeExecutorServiceTests + .getThreadPoolMergeExecutorService(testThreadPool); + assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); + try ( + ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( + new ShardId("index", "_na_", 1), + IndexSettingsModule.newIndexSettings("index", settings), + threadPoolMergeExecutorService + ) + ) { + CountDownLatch mergeDoneLatch = new CountDownLatch(1); + MergeSource mergeSource = 
mock(MergeSource.class); + OneMerge oneMerge = mock(OneMerge.class); + when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomLongBetween(1L, 10L))); + when(oneMerge.getMergeProgress()).thenReturn(new MergePolicy.OneMergeProgress()); + when(mergeSource.getNextMerge()).thenReturn(oneMerge, (OneMerge) null); + doAnswer(invocation -> { + OneMerge merge = (OneMerge) invocation.getArguments()[0]; + assertFalse(merge.isAborted()); + // wait to be signalled before completing the merge + mergeDoneLatch.await(); + return null; + }).when(mergeSource).merge(any(OneMerge.class)); + threadPoolMergeScheduler.merge(mergeSource, randomFrom(MergeTrigger.values())); + Thread t = new Thread(() -> { + try { + threadPoolMergeScheduler.close(); + } catch (IOException e) { + fail(e); + } + }); + t.start(); + try { + assertTrue(t.isAlive()); + // ensure the merge scheduler is effectively "closed" + assertBusy(() -> { + MergeSource mergeSource2 = mock(MergeSource.class); + threadPoolMergeScheduler.merge(mergeSource2, randomFrom(MergeTrigger.values())); + // when the merge scheduler is closed it won't pull in any new merges from the merge source + verifyNoInteractions(mergeSource2); + }); + // assert the merge still shows up as "running" + assertThat(threadPoolMergeScheduler.getRunningMergeTasks().keySet(), contains(oneMerge)); + assertThat(threadPoolMergeScheduler.getBackloggedMergeTasks().size(), is(0)); + assertTrue(t.isAlive()); + // signal the merge to finish + mergeDoneLatch.countDown(); + } finally { + t.join(); + } + assertBusy(() -> { + assertThat(threadPoolMergeScheduler.getRunningMergeTasks().size(), is(0)); + assertThat(threadPoolMergeScheduler.getBackloggedMergeTasks().size(), is(0)); + assertTrue(threadPoolMergeExecutorService.allDone()); + }); + } + } + } + + public void testAutoIOThrottleForMergeTasksWhenSchedulerDisablesIt() throws Exception { + // merge scheduler configured with auto IO throttle disabled + Settings settings = 
Settings.builder().put(MergeSchedulerConfig.AUTO_THROTTLE_SETTING.getKey(), false).build(); + IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("index", settings); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = mock(ThreadPoolMergeExecutorService.class); + MergePolicy.OneMergeProgress oneMergeProgress = new MergePolicy.OneMergeProgress(); + OneMerge oneMerge = mock(OneMerge.class); + when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomNonNegativeLong())); + when(oneMerge.getMergeProgress()).thenReturn(oneMergeProgress); + MergeSource mergeSource = mock(MergeSource.class); + when(mergeSource.getNextMerge()).thenReturn(oneMerge); + try ( + ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( + new ShardId("index", "_na_", 1), + indexSettings, + threadPoolMergeExecutorService + ) + ) { + threadPoolMergeScheduler.merge(mergeSource, randomFrom(MergeTrigger.values())); + var submittedMergeTaskCaptor = ArgumentCaptor.forClass(MergeTask.class); + verify(threadPoolMergeExecutorService).submitMergeTask(submittedMergeTaskCaptor.capture()); + assertFalse(submittedMergeTaskCaptor.getValue().supportsIOThrottling()); + } + } + + public void testAutoIOThrottleForMergeTasks() throws Exception { + final Settings.Builder settingsBuilder = Settings.builder(); + // merge scheduler configured with auto IO throttle enabled + if (randomBoolean()) { + settingsBuilder.put(MergeSchedulerConfig.AUTO_THROTTLE_SETTING.getKey(), true); + } + IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("index", settingsBuilder.build()); + MergePolicy.OneMergeProgress oneMergeProgress = new MergePolicy.OneMergeProgress(); + OneMerge oneMerge = mock(OneMerge.class); + // forced merge with a set number of segments + when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomNonNegativeLong(), randomNonNegativeInt())); + when(oneMerge.getMergeProgress()).thenReturn(oneMergeProgress); + MergeSource 
mergeSource = mock(MergeSource.class); + when(mergeSource.getNextMerge()).thenReturn(oneMerge); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = mock(ThreadPoolMergeExecutorService.class); + try ( + ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( + new ShardId("index", "_na_", 1), + indexSettings, + threadPoolMergeExecutorService + ) + ) { + threadPoolMergeScheduler.merge(mergeSource, randomFrom(MergeTrigger.values())); + var submittedMergeTaskCaptor = ArgumentCaptor.forClass(MergeTask.class); + verify(threadPoolMergeExecutorService).submitMergeTask(submittedMergeTaskCaptor.capture()); + // forced merge tasks should not be IO throttled + assertFalse(submittedMergeTaskCaptor.getValue().supportsIOThrottling()); + } + // NOT a forced merge + when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomNonNegativeLong(), -1)); + threadPoolMergeExecutorService = mock(ThreadPoolMergeExecutorService.class); + try ( + ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( + new ShardId("index", "_na_", 1), + indexSettings, + threadPoolMergeExecutorService + ) + ) { + // merge submitted upon closing + threadPoolMergeScheduler.merge(mergeSource, MergeTrigger.CLOSING); + var submittedMergeTaskCaptor = ArgumentCaptor.forClass(MergeTask.class); + verify(threadPoolMergeExecutorService).submitMergeTask(submittedMergeTaskCaptor.capture()); + // merge tasks submitted when closing should not be IO throttled + assertFalse(submittedMergeTaskCaptor.getValue().supportsIOThrottling()); + } + // otherwise, merge tasks should be auto IO throttled + threadPoolMergeExecutorService = mock(ThreadPoolMergeExecutorService.class); + try ( + ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( + new ShardId("index", "_na_", 1), + indexSettings, + threadPoolMergeExecutorService + ) + ) { + // merge submitted upon closing + threadPoolMergeScheduler.merge( + mergeSource, + 
randomValueOtherThan(MergeTrigger.CLOSING, () -> randomFrom(MergeTrigger.values())) + ); + var submittedMergeTaskCaptor = ArgumentCaptor.forClass(MergeTask.class); + verify(threadPoolMergeExecutorService).submitMergeTask(submittedMergeTaskCaptor.capture()); + // merge tasks should be auto IO throttled + assertTrue(submittedMergeTaskCaptor.getValue().supportsIOThrottling()); + } + } + + private static MergeInfo getNewMergeInfo(long estimatedMergeBytes) { + return getNewMergeInfo(estimatedMergeBytes, randomFrom(-1, randomNonNegativeInt())); + } + + private static MergeInfo getNewMergeInfo(long estimatedMergeBytes, int maxNumSegments) { + return new MergeInfo(randomNonNegativeInt(), estimatedMergeBytes, randomBoolean(), maxNumSegments); + } +} diff --git a/server/src/test/java/org/elasticsearch/index/shard/IndexShardRetentionLeaseTests.java b/server/src/test/java/org/elasticsearch/index/shard/IndexShardRetentionLeaseTests.java index 8c325c945a7a2..38d89f08378bd 100644 --- a/server/src/test/java/org/elasticsearch/index/shard/IndexShardRetentionLeaseTests.java +++ b/server/src/test/java/org/elasticsearch/index/shard/IndexShardRetentionLeaseTests.java @@ -41,8 +41,8 @@ public class IndexShardRetentionLeaseTests extends IndexShardTestCase { private final AtomicLong currentTimeMillis = new AtomicLong(); @Override - protected ThreadPool setUpThreadPool() { - return new TestThreadPool(getClass().getName(), threadPoolSettings()) { + protected ThreadPool setUpThreadPool(Settings settings) { + return new TestThreadPool(getClass().getName(), settings) { @Override public long absoluteTimeInMillis() { return currentTimeMillis.get(); diff --git a/server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java b/server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java index 4549a329d499a..43f5d58ee8e0c 100644 --- a/server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java +++ 
b/server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java @@ -5019,6 +5019,7 @@ public void testCloseShardWhileEngineIsWarming() throws Exception { EngineConfig configWithWarmer = new EngineConfig( config.getShardId(), config.getThreadPool(), + config.getThreadPoolMergeExecutorService(), config.getIndexSettings(), warmer, config.getStore(), diff --git a/server/src/test/java/org/elasticsearch/index/shard/RefreshListenersTests.java b/server/src/test/java/org/elasticsearch/index/shard/RefreshListenersTests.java index ca616dc619ec9..4e280f5443787 100644 --- a/server/src/test/java/org/elasticsearch/index/shard/RefreshListenersTests.java +++ b/server/src/test/java/org/elasticsearch/index/shard/RefreshListenersTests.java @@ -41,6 +41,8 @@ import org.elasticsearch.index.engine.EngineConfig; import org.elasticsearch.index.engine.EngineTestCase; import org.elasticsearch.index.engine.InternalEngine; +import org.elasticsearch.index.engine.ThreadPoolMergeExecutorService; +import org.elasticsearch.index.engine.ThreadPoolMergeScheduler; import org.elasticsearch.index.mapper.IdFieldMapper; import org.elasticsearch.index.mapper.LuceneDocument; import org.elasticsearch.index.mapper.MapperService; @@ -89,6 +91,7 @@ public class RefreshListenersTests extends ESTestCase { private Engine engine; private volatile int maxListeners; private ThreadPool threadPool; + private ThreadPoolMergeExecutorService threadPoolMergeExecutorService; private Store store; @Before @@ -97,6 +100,11 @@ public void setupListeners() throws Exception { maxListeners = randomIntBetween(2, 1000); // Now setup the InternalEngine which is much more complicated because we aren't mocking anything threadPool = new TestThreadPool(getTestName()); + Settings settings = Settings.builder() + .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), randomBoolean()) + .build(); + IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("index", settings); + 
threadPoolMergeExecutorService = ThreadPoolMergeExecutorService.maybeCreateThreadPoolMergeExecutorService(threadPool, settings); listeners = new RefreshListeners( () -> maxListeners, () -> engine.refresh("too-many-listeners"), @@ -105,7 +113,6 @@ public void setupListeners() throws Exception { new MeanMetric() ); - IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("index", Settings.EMPTY); ShardId shardId = new ShardId(new Index("index", "_na_"), 1); Directory directory = newDirectory(); store = new Store(shardId, indexSettings, directory, new DummyShardLock(shardId)); @@ -134,6 +141,7 @@ public void onFailedEngine(String reason, @Nullable Exception e) { EngineConfig config = new EngineConfig( shardId, threadPool, + threadPoolMergeExecutorService, indexSettings, null, store, diff --git a/test/framework/src/main/java/org/elasticsearch/index/engine/EngineTestCase.java b/test/framework/src/main/java/org/elasticsearch/index/engine/EngineTestCase.java index 7a2f375001874..d4554df1617ee 100644 --- a/test/framework/src/main/java/org/elasticsearch/index/engine/EngineTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/index/engine/EngineTestCase.java @@ -155,6 +155,7 @@ public abstract class EngineTestCase extends ESTestCase { protected static final IndexSettings INDEX_SETTINGS = IndexSettingsModule.newIndexSettings("index", Settings.EMPTY); protected ThreadPool threadPool; + protected ThreadPoolMergeExecutorService threadPoolMergeExecutorService; protected TranslogHandler translogHandler; protected Store store; @@ -197,6 +198,7 @@ protected Settings indexSettings() { between(10, 10 * IndexSettings.MAX_REFRESH_LISTENERS_PER_SHARD.get(Settings.EMPTY)) ) .put(IndexSettings.INDEX_SOFT_DELETES_RETENTION_OPERATIONS_SETTING.getKey(), between(0, 1000)) + .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), randomBoolean()) .build(); } @@ -241,6 +243,11 @@ public void setUp() throws Exception { } defaultSettings = 
IndexSettingsModule.newIndexSettings("index", indexSettings()); threadPool = new TestThreadPool(getClass().getName()); + threadPoolMergeExecutorService = ThreadPoolMergeExecutorService.maybeCreateThreadPoolMergeExecutorService( + threadPool, + defaultSettings.getNodeSettings() + ); + store = createStore(); storeReplica = createStore(); Lucene.cleanLuceneIndex(store.directory()); @@ -272,6 +279,7 @@ public static EngineConfig copy(EngineConfig config, LongSupplier globalCheckpoi return new EngineConfig( config.getShardId(), config.getThreadPool(), + config.getThreadPoolMergeExecutorService(), config.getIndexSettings(), config.getWarmer(), config.getStore(), @@ -304,6 +312,7 @@ public EngineConfig copy(EngineConfig config, Analyzer analyzer) { return new EngineConfig( config.getShardId(), config.getThreadPool(), + config.getThreadPoolMergeExecutorService(), config.getIndexSettings(), config.getWarmer(), config.getStore(), @@ -336,6 +345,7 @@ public EngineConfig copy(EngineConfig config, MergePolicy mergePolicy) { return new EngineConfig( config.getShardId(), config.getThreadPool(), + config.getThreadPoolMergeExecutorService(), config.getIndexSettings(), config.getWarmer(), config.getStore(), @@ -840,6 +850,7 @@ public EngineConfig config( return new EngineConfig( shardId, threadPool, + threadPoolMergeExecutorService, indexSettings, null, store, @@ -880,6 +891,7 @@ protected EngineConfig config(EngineConfig config, Store store, Path translogPat return new EngineConfig( config.getShardId(), config.getThreadPool(), + config.getThreadPoolMergeExecutorService(), indexSettings, config.getWarmer(), store, diff --git a/test/framework/src/main/java/org/elasticsearch/index/shard/IndexShardTestCase.java b/test/framework/src/main/java/org/elasticsearch/index/shard/IndexShardTestCase.java index 2ae4bb0343101..e8286835e9cfa 100644 --- a/test/framework/src/main/java/org/elasticsearch/index/shard/IndexShardTestCase.java +++ 
b/test/framework/src/main/java/org/elasticsearch/index/shard/IndexShardTestCase.java @@ -54,6 +54,8 @@ import org.elasticsearch.index.engine.EngineFactory; import org.elasticsearch.index.engine.EngineTestCase; import org.elasticsearch.index.engine.InternalEngineFactory; +import org.elasticsearch.index.engine.ThreadPoolMergeExecutorService; +import org.elasticsearch.index.engine.ThreadPoolMergeScheduler; import org.elasticsearch.index.mapper.MapperMetrics; import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.SourceToParse; @@ -152,6 +154,7 @@ public void onRecoveryFailure(RecoveryFailedException e, boolean sendShardFailur }; protected ThreadPool threadPool; + protected ThreadPoolMergeExecutorService threadPoolMergeExecutorService; protected Executor writeExecutor; protected long primaryTerm; @@ -166,14 +169,16 @@ public static void addMockCloseImplementation(IndexShard shard) throws IOExcepti @Override public void setUp() throws Exception { super.setUp(); - threadPool = setUpThreadPool(); + Settings settings = threadPoolSettings(); + threadPool = setUpThreadPool(settings); + threadPoolMergeExecutorService = ThreadPoolMergeExecutorService.maybeCreateThreadPoolMergeExecutorService(threadPool, settings); writeExecutor = threadPool.executor(ThreadPool.Names.WRITE); primaryTerm = randomIntBetween(1, 100); // use random but fixed term for creating shards failOnShardFailures(); } - protected ThreadPool setUpThreadPool() { - return new TestThreadPool(getClass().getName(), threadPoolSettings()); + protected ThreadPool setUpThreadPool(Settings settings) { + return new TestThreadPool(getClass().getName(), settings); } @Override @@ -203,7 +208,7 @@ protected void failOnShardFailures() { } public Settings threadPoolSettings() { - return Settings.EMPTY; + return Settings.builder().put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), randomBoolean()).build(); } protected Store createStore(IndexSettings indexSettings, 
ShardPath shardPath) throws IOException { @@ -537,6 +542,7 @@ protected IndexShard newShard( indexEventListener, indexReaderWrapper, threadPool, + threadPoolMergeExecutorService, BigArrays.NON_RECYCLING_INSTANCE, warmer, Collections.emptyList(), diff --git a/test/framework/src/main/java/org/elasticsearch/test/ESIntegTestCase.java b/test/framework/src/main/java/org/elasticsearch/test/ESIntegTestCase.java index 0c34b0fddc5c8..9be119a4cbf35 100644 --- a/test/framework/src/main/java/org/elasticsearch/test/ESIntegTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/test/ESIntegTestCase.java @@ -131,6 +131,7 @@ import org.elasticsearch.index.MockEngineFactoryPlugin; import org.elasticsearch.index.codec.CodecService; import org.elasticsearch.index.engine.Segment; +import org.elasticsearch.index.engine.ThreadPoolMergeScheduler; import org.elasticsearch.index.mapper.MockFieldFilterPlugin; import org.elasticsearch.index.translog.Translog; import org.elasticsearch.indices.IndicesQueryCache; @@ -1608,12 +1609,39 @@ protected final BroadcastResponse flush(String... indices) { * Waits for all relocations and force merge all indices in the cluster to 1 segment. */ protected BroadcastResponse forceMerge() { + return forceMerge(randomBoolean()); + } + + /** + * Waits for all relocations and force merge all indices in the cluster to 1 segment. 
+ */ + protected BroadcastResponse forceMerge(boolean assertOneSegment) { waitForRelocation(); BroadcastResponse actionGet = indicesAdmin().prepareForceMerge().setMaxNumSegments(1).get(); assertNoFailures(actionGet); + if (assertOneSegment) { + // after a force merge there should only be 1 segment per shard + var shardsWithMultipleSegments = getShardSegments().stream() + .filter(shardSegments -> shardSegments.getSegments().size() > 1) + .toList(); + assertTrue("there are shards with multiple segments " + shardsWithMultipleSegments, shardsWithMultipleSegments.isEmpty()); + } return actionGet; } + /** + * Returns the segments of the shards of the indices. + */ + protected List getShardSegments(String... indices) { + IndicesSegmentResponse indicesSegmentResponse = indicesAdmin().prepareSegments(indices).get(); + return indicesSegmentResponse.getIndices() + .values() + .stream() + .flatMap(indexSegments -> indexSegments.getShards().values().stream()) + .flatMap(indexShardSegments -> Stream.of(indexShardSegments.shards())) + .toList(); + } + /** * Returns true iff the given index exists otherwise false */ @@ -2057,6 +2085,7 @@ protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) { builder.put(IndexingPressure.SPLIT_BULK_HIGH_WATERMARK.getKey(), randomFrom("1KB", "16KB", "64KB")); builder.put(IndexingPressure.SPLIT_BULK_HIGH_WATERMARK_SIZE.getKey(), "256B"); } + builder.put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), randomBoolean()); return builder.build(); } diff --git a/test/framework/src/main/java/org/elasticsearch/test/InternalTestCluster.java b/test/framework/src/main/java/org/elasticsearch/test/InternalTestCluster.java index ae79636c6b14c..07467fa69e7bf 100644 --- a/test/framework/src/main/java/org/elasticsearch/test/InternalTestCluster.java +++ b/test/framework/src/main/java/org/elasticsearch/test/InternalTestCluster.java @@ -1418,6 +1418,20 @@ public void assertConsistentHistoryBetweenTranslogAndLuceneIndex() 
throws IOExce } } + public void assertMergeExecutorIsDone() throws Exception { + assertBusy(() -> { + for (String nodeName : getNodeNames()) { + IndicesService indicesService = getInstance(IndicesService.class, nodeName); + if (indicesService.getThreadPoolMergeExecutorService() != null) { + assertTrue( + "thread pool merge executor is not done after test", + indicesService.getThreadPoolMergeExecutorService().allDone() + ); + } + } + }); + } + public void assertNoInFlightDocsInEngine() throws Exception { assertBusy(() -> { for (String nodeName : getNodeNames()) { @@ -2526,6 +2540,7 @@ public synchronized void assertAfterTest() throws Exception { assertRequestsFinished(); assertSearchContextsReleased(); assertNoInFlightDocsInEngine(); + assertMergeExecutorIsDone(); awaitIndexShardCloseAsyncTasks(); for (NodeAndClient nodeAndClient : nodes.values()) { NodeEnvironment env = nodeAndClient.node().getNodeEnvironment(); diff --git a/x-pack/plugin/ccr/src/test/java/org/elasticsearch/xpack/ccr/index/engine/FollowingEngineTests.java b/x-pack/plugin/ccr/src/test/java/org/elasticsearch/xpack/ccr/index/engine/FollowingEngineTests.java index 62dc3313a1172..957570918cde3 100644 --- a/x-pack/plugin/ccr/src/test/java/org/elasticsearch/xpack/ccr/index/engine/FollowingEngineTests.java +++ b/x-pack/plugin/ccr/src/test/java/org/elasticsearch/xpack/ccr/index/engine/FollowingEngineTests.java @@ -34,6 +34,8 @@ import org.elasticsearch.index.engine.EngineConfig; import org.elasticsearch.index.engine.EngineTestCase; import org.elasticsearch.index.engine.InternalEngine; +import org.elasticsearch.index.engine.ThreadPoolMergeExecutorService; +import org.elasticsearch.index.engine.ThreadPoolMergeScheduler; import org.elasticsearch.index.engine.TranslogHandler; import org.elasticsearch.index.mapper.DocumentMapper; import org.elasticsearch.index.mapper.MapperService; @@ -82,6 +84,7 @@ public class FollowingEngineTests extends ESTestCase { private ThreadPool threadPool; + private 
ThreadPoolMergeExecutorService threadPoolMergeExecutorService; private Index index; private ShardId shardId; private AtomicLong primaryTerm = new AtomicLong(); @@ -91,7 +94,11 @@ public class FollowingEngineTests extends ESTestCase { @Override public void setUp() throws Exception { super.setUp(); - threadPool = new TestThreadPool("following-engine-tests"); + Settings settings = Settings.builder() + .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), randomBoolean()) + .build(); + threadPool = new TestThreadPool("following-engine-tests", settings); + threadPoolMergeExecutorService = ThreadPoolMergeExecutorService.maybeCreateThreadPoolMergeExecutorService(threadPool, settings); index = new Index("index", "uuid"); shardId = new ShardId(index, 0); primaryTerm.set(randomLongBetween(1, Long.MAX_VALUE)); @@ -113,7 +120,7 @@ public void testFollowingEngineRejectsNonFollowingIndex() throws IOException { final IndexMetadata indexMetadata = IndexMetadata.builder(index.getName()).settings(settings).build(); final IndexSettings indexSettings = new IndexSettings(indexMetadata, settings); try (Store store = createStore(shardId, indexSettings, newDirectory())) { - final EngineConfig engineConfig = engineConfig(shardId, indexSettings, threadPool, store); + final EngineConfig engineConfig = engineConfig(shardId, indexSettings, threadPool, threadPoolMergeExecutorService, store); final IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> new FollowingEngine(engineConfig)); assertThat(e, hasToString(containsString("a following engine can not be constructed for a non-following index"))); } @@ -137,7 +144,7 @@ public void testOutOfOrderDocuments() throws IOException { final IndexMetadata indexMetadata = IndexMetadata.builder(index.getName()).settings(settings).build(); final IndexSettings indexSettings = new IndexSettings(indexMetadata, settings); try (Store store = createStore(shardId, indexSettings, newDirectory())) { - final 
EngineConfig engineConfig = engineConfig(shardId, indexSettings, threadPool, store); + final EngineConfig engineConfig = engineConfig(shardId, indexSettings, threadPool, threadPoolMergeExecutorService, store); try (FollowingEngine followingEngine = createEngine(store, engineConfig)) { final VersionType versionType = randomFrom(VersionType.INTERNAL, VersionType.EXTERNAL, VersionType.EXTERNAL_GTE); final List ops = EngineTestCase.generateSingleDocHistory(true, versionType, 2, 2, 20, "id"); @@ -156,7 +163,7 @@ public void runIndexTest( final IndexMetadata indexMetadata = IndexMetadata.builder(index.getName()).settings(settings).build(); final IndexSettings indexSettings = new IndexSettings(indexMetadata, settings); try (Store store = createStore(shardId, indexSettings, newDirectory())) { - final EngineConfig engineConfig = engineConfig(shardId, indexSettings, threadPool, store); + final EngineConfig engineConfig = engineConfig(shardId, indexSettings, threadPool, threadPoolMergeExecutorService, store); try (FollowingEngine followingEngine = createEngine(store, engineConfig)) { final Engine.Index indexToTest = indexForFollowing("id", seqNo, origin); consumer.accept(followingEngine, indexToTest); @@ -182,7 +189,7 @@ public void runDeleteTest( final IndexMetadata indexMetadata = IndexMetadata.builder(index.getName()).settings(settings).build(); final IndexSettings indexSettings = new IndexSettings(indexMetadata, settings); try (Store store = createStore(shardId, indexSettings, newDirectory())) { - final EngineConfig engineConfig = engineConfig(shardId, indexSettings, threadPool, store); + final EngineConfig engineConfig = engineConfig(shardId, indexSettings, threadPool, threadPoolMergeExecutorService, store); try (FollowingEngine followingEngine = createEngine(store, engineConfig)) { final String id = "id"; final Engine.Delete delete = new Engine.Delete( @@ -208,7 +215,7 @@ public void testDoNotFillSeqNoGaps() throws Exception { final IndexMetadata indexMetadata = 
IndexMetadata.builder(index.getName()).settings(settings).build(); final IndexSettings indexSettings = new IndexSettings(indexMetadata, settings); try (Store store = createStore(shardId, indexSettings, newDirectory())) { - final EngineConfig engineConfig = engineConfig(shardId, indexSettings, threadPool, store); + final EngineConfig engineConfig = engineConfig(shardId, indexSettings, threadPool, threadPoolMergeExecutorService, store); try (FollowingEngine followingEngine = createEngine(store, engineConfig)) { followingEngine.index(indexForFollowing("id", 128, Engine.Operation.Origin.PRIMARY)); int addedNoops = followingEngine.fillSeqNoGaps(primaryTerm.get()); @@ -221,6 +228,7 @@ private EngineConfig engineConfig( final ShardId shardIdValue, final IndexSettings indexSettings, final ThreadPool threadPool, + final ThreadPoolMergeExecutorService threadPoolMergeExecutorService, final Store store ) throws IOException { final IndexWriterConfig indexWriterConfig = newIndexWriterConfig(); @@ -235,6 +243,7 @@ private EngineConfig engineConfig( return new EngineConfig( shardIdValue, threadPool, + threadPoolMergeExecutorService, indexSettings, null, store, @@ -506,7 +515,13 @@ public void testConcurrentIndexOperationsWithDeletesCanAdvanceMaxSeqNoOfUpdates( IndexMetadata followerIndexMetadata = IndexMetadata.builder(index.getName()).settings(followerSettings).build(); IndexSettings followerIndexSettings = new IndexSettings(followerIndexMetadata, Settings.EMPTY); try (Store followerStore = createStore(shardId, followerIndexSettings, newDirectory())) { - EngineConfig followerConfig = engineConfig(shardId, followerIndexSettings, threadPool, followerStore); + EngineConfig followerConfig = engineConfig( + shardId, + followerIndexSettings, + threadPool, + threadPoolMergeExecutorService, + followerStore + ); followerStore.createEmpty(); String translogUuid = Translog.createEmptyTranslog( followerConfig.getTranslogConfig().getTranslogPath(), @@ -613,7 +628,7 @@ private void 
runFollowTest(CheckedBiConsumer operationWithTerms = new HashMap<>(); @@ -882,7 +903,7 @@ public void testMaxSeqNoInCommitUserData() throws Exception { final IndexMetadata indexMetadata = IndexMetadata.builder(index.getName()).settings(settings).build(); final IndexSettings indexSettings = new IndexSettings(indexMetadata, settings); try (Store store = createStore(shardId, indexSettings, newDirectory())) { - final EngineConfig engineConfig = engineConfig(shardId, indexSettings, threadPool, store); + final EngineConfig engineConfig = engineConfig(shardId, indexSettings, threadPool, threadPoolMergeExecutorService, store); try (FollowingEngine engine = createEngine(store, engineConfig)) { AtomicBoolean running = new AtomicBoolean(true); Thread rollTranslog = new Thread(() -> { diff --git a/x-pack/plugin/searchable-snapshots/src/internalClusterTest/java/org/elasticsearch/xpack/searchablesnapshots/RetrySearchIntegTests.java b/x-pack/plugin/searchable-snapshots/src/internalClusterTest/java/org/elasticsearch/xpack/searchablesnapshots/RetrySearchIntegTests.java index c9a1a82b34118..ef7fd2c6b065d 100644 --- a/x-pack/plugin/searchable-snapshots/src/internalClusterTest/java/org/elasticsearch/xpack/searchablesnapshots/RetrySearchIntegTests.java +++ b/x-pack/plugin/searchable-snapshots/src/internalClusterTest/java/org/elasticsearch/xpack/searchablesnapshots/RetrySearchIntegTests.java @@ -57,7 +57,8 @@ public void testSearcherId() throws Exception { equalTo(0) ); refresh(indexName); - forceMerge(); + // force merge with expunge deletes is not merging down to one segment only + forceMerge(false); final String repositoryName = randomAlphaOfLength(10).toLowerCase(Locale.ROOT); createRepository(repositoryName, "fs"); @@ -125,7 +126,8 @@ public void testRetryPointInTime() throws Exception { equalTo(0) ); refresh(indexName); - forceMerge(); + // force merge with expunge deletes is not merging down to one segment only + forceMerge(false); final String repositoryName = 
randomAlphaOfLength(10).toLowerCase(Locale.ROOT); createRepository(repositoryName, "fs"); diff --git a/x-pack/plugin/snapshot-based-recoveries/src/internalClusterTest/java/org/elasticsearch/xpack/snapshotbasedrecoveries/recovery/SnapshotBasedIndexRecoveryIT.java b/x-pack/plugin/snapshot-based-recoveries/src/internalClusterTest/java/org/elasticsearch/xpack/snapshotbasedrecoveries/recovery/SnapshotBasedIndexRecoveryIT.java index 405a9926e2e5f..8e3bb1d3f27db 100644 --- a/x-pack/plugin/snapshot-based-recoveries/src/internalClusterTest/java/org/elasticsearch/xpack/snapshotbasedrecoveries/recovery/SnapshotBasedIndexRecoveryIT.java +++ b/x-pack/plugin/snapshot-based-recoveries/src/internalClusterTest/java/org/elasticsearch/xpack/snapshotbasedrecoveries/recovery/SnapshotBasedIndexRecoveryIT.java @@ -465,7 +465,7 @@ public void testPeerRecoveryTriesToUseMostOfTheDataFromAnAvailableSnapshot() thr int numDocs = randomIntBetween(300, 1000); indexDocs(indexName, 0, numDocs); - forceMerge(); + forceMerge(false); String repoName = "repo"; createRepo(repoName, TestRepositoryPlugin.INSTRUMENTED_TYPE); From a05d380381bb19d7c3cb203b1e6f9cb3a62aa3e3 Mon Sep 17 00:00:00 2001 From: Albert Zaharovits Date: Mon, 31 Mar 2025 00:26:12 +0300 Subject: [PATCH 02/14] Start indexing throttling only after disk IO unthrottling does not keep up with the merge load (#125654) Fixes an issue where indexing throttling kicks in while disk IO is throttling. Instead disk IO should first unthrottle, and only then, if we still can't keep up with the merging load, start throttling indexing. 
Fixes elastic/elasticsearch-benchmarks#2437 Relates #120869 --- .../indices/stats/IndexStatsIT.java | 52 +++-- .../ThreadPoolMergeExecutorService.java | 4 + .../engine/ThreadPoolMergeScheduler.java | 7 +- .../engine/ThreadPoolMergeSchedulerTests.java | 204 ++++++++++++++++++ 4 files changed, 243 insertions(+), 24 deletions(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/indices/stats/IndexStatsIT.java b/server/src/internalClusterTest/java/org/elasticsearch/indices/stats/IndexStatsIT.java index ab91d92927217..afd9e98b3e7d9 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/indices/stats/IndexStatsIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/indices/stats/IndexStatsIT.java @@ -78,6 +78,7 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicReference; import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_REPLICAS; @@ -465,7 +466,7 @@ public void testNonThrottleStats() throws Exception { assertThat(stats.getPrimaries().getIndexing().getTotal().getThrottleTime().millis(), equalTo(0L)); } - public void testThrottleStats() throws Exception { + public void testThrottleStats() { assertAcked( prepareCreate("test").setSettings( settingsBuilder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, "1") @@ -478,31 +479,38 @@ public void testThrottleStats() throws Exception { ) ); ensureGreen(); - long termUpto = 0; - IndicesStatsResponse stats; // make sure we see throttling kicking in: - boolean done = false; + AtomicBoolean done = new AtomicBoolean(); + AtomicLong termUpTo = new AtomicLong(); long start = System.currentTimeMillis(); - while (done == false) { - for (int i = 0; i < 100; i++) { - // Provoke slowish merging by making many unique terms: - StringBuilder sb = new StringBuilder(); - for (int j = 0; j < 100; j++) { 
- sb.append(' '); - sb.append(termUpto++); - } - prepareIndex("test").setId("" + termUpto).setSource("field" + (i % 10), sb.toString()).get(); - if (i % 2 == 0) { + for (int threadIdx = 0; threadIdx < 5; threadIdx++) { + int finalThreadIdx = threadIdx; + new Thread(() -> { + IndicesStatsResponse stats; + while (done.get() == false) { + for (int i = 0; i < 100; i++) { + // Provoke slowish merging by making many unique terms: + StringBuilder sb = new StringBuilder(); + for (int j = 0; j < 100; j++) { + sb.append(' '); + sb.append(termUpTo.incrementAndGet()); + } + prepareIndex("test").setId("" + termUpTo.get()).setSource("field" + (i % 10), sb.toString()).get(); + if (i % 2 == 0) { + refresh(); + } + } refresh(); + if (finalThreadIdx == 0) { + stats = indicesAdmin().prepareStats().get(); + done.set(stats.getPrimaries().getIndexing().getTotal().getThrottleTime().millis() > 0); + } + if (System.currentTimeMillis() - start > 300 * 1000) { // Wait 5 minutes for throttling to kick in + done.set(true); + fail("index throttling didn't kick in after 5 minutes of intense merging"); + } } - } - refresh(); - stats = indicesAdmin().prepareStats().get(); - // nodesStats = clusterAdmin().prepareNodesStats().setIndices(true).get(); - done = stats.getPrimaries().getIndexing().getTotal().getThrottleTime().millis() > 0; - if (System.currentTimeMillis() - start > 300 * 1000) { // Wait 5 minutes for throttling to kick in - fail("index throttling didn't kick in after 5 minutes of intense merging"); - } + }).start(); } // Optimize & flush and wait; else we sometimes get a "Delete Index failed - not acked" diff --git a/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorService.java b/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorService.java index 5217edb5490dc..7c78698ac6f66 100644 --- a/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorService.java +++ 
b/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorService.java @@ -272,6 +272,10 @@ interface UpdateConsumer { } } + public boolean usingMaxTargetIORateBytesPerSec() { + return MAX_IO_RATE.getBytes() == targetIORateBytesPerSec.get(); + } + // exposed for tests Set getRunningMergeTasks() { return runningMergeTasks; diff --git a/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeScheduler.java b/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeScheduler.java index 8cfdc59268365..f645edaff64a8 100644 --- a/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeScheduler.java +++ b/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeScheduler.java @@ -50,7 +50,7 @@ public class ThreadPoolMergeScheduler extends MergeScheduler implements Elastics ); private final ShardId shardId; private final MergeSchedulerConfig config; - private final Logger logger; + protected final Logger logger; private final MergeTracking mergeTracking; private final ThreadPoolMergeExecutorService threadPoolMergeExecutorService; private final PriorityQueue backloggedMergeTasks = new PriorityQueue<>( @@ -191,7 +191,10 @@ private void checkMergeTaskThrottling() { int configuredMaxMergeCount = config.getMaxMergeCount(); // both currently running and enqueued merge tasks are considered "active" for throttling purposes int activeMerges = (int) (submittedMergesCount - doneMergesCount); - if (activeMerges > configuredMaxMergeCount && shouldThrottleIncomingMerges.get() == false) { + if (activeMerges > configuredMaxMergeCount + // only throttle indexing if disk IO is un-throttled, and we still can't keep up with the merge load + && threadPoolMergeExecutorService.usingMaxTargetIORateBytesPerSec() + && shouldThrottleIncomingMerges.get() == false) { // maybe enable merge task throttling synchronized (shouldThrottleIncomingMerges) { if (shouldThrottleIncomingMerges.getAndSet(true) == false) { diff --git 
a/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java index 5e085c083b785..ae9168357eb32 100644 --- a/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java +++ b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java @@ -173,6 +173,165 @@ public void testSimpleMergeTaskReEnqueueingBySize() { } } + public void testIndexingThrottlingWhenSubmittingMerges() { + final int maxThreadCount = randomIntBetween(1, 5); + // settings validation requires maxMergeCount >= maxThreadCount + final int maxMergeCount = maxThreadCount + randomIntBetween(0, 5); + List submittedMergeTasks = new ArrayList<>(); + AtomicBoolean isUsingMaxTargetIORate = new AtomicBoolean(false); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = mockThreadPoolMergeExecutorService( + submittedMergeTasks, + isUsingMaxTargetIORate + ); + Settings mergeSchedulerSettings = Settings.builder() + .put(MergeSchedulerConfig.MAX_THREAD_COUNT_SETTING.getKey(), maxThreadCount) + .put(MergeSchedulerConfig.MAX_MERGE_COUNT_SETTING.getKey(), maxMergeCount) + .build(); + TestThreadPoolMergeScheduler threadPoolMergeScheduler = new TestThreadPoolMergeScheduler( + new ShardId("index", "_na_", 1), + IndexSettingsModule.newIndexSettings("index", mergeSchedulerSettings), + threadPoolMergeExecutorService + ); + // make sure there are more merges submitted than the max merge count limit (which triggers IO throttling) + int excessMerges = randomIntBetween(1, 10); + int mergesToSubmit = maxMergeCount + excessMerges; + boolean expectIndexThrottling = false; + int submittedMerges = 0; + // merges are submitted, while some are also scheduled (but none is run) + while (submittedMerges < mergesToSubmit - 1) { + isUsingMaxTargetIORate.set(randomBoolean()); + if (submittedMergeTasks.isEmpty() == false && randomBoolean()) { + // maybe schedule 
one submitted merge + MergeTask mergeTask = randomFrom(submittedMergeTasks); + submittedMergeTasks.remove(mergeTask); + mergeTask.schedule(); + } else { + // submit one merge + MergeSource mergeSource = mock(MergeSource.class); + OneMerge oneMerge = mock(OneMerge.class); + when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomLongBetween(1L, 10L))); + when(oneMerge.getMergeProgress()).thenReturn(new MergePolicy.OneMergeProgress()); + when(mergeSource.getNextMerge()).thenReturn(oneMerge, (OneMerge) null); + threadPoolMergeScheduler.merge(mergeSource, randomFrom(MergeTrigger.values())); + submittedMerges++; + if (isUsingMaxTargetIORate.get() && submittedMerges > maxMergeCount) { + expectIndexThrottling = true; + } else if (submittedMerges <= maxMergeCount) { + expectIndexThrottling = false; + } + } + // assert IO throttle state + assertThat(threadPoolMergeScheduler.isIndexingThrottlingEnabled(), is(expectIndexThrottling)); + } + // submit one last merge when IO throttling is at max value + isUsingMaxTargetIORate.set(true); + MergeSource mergeSource = mock(MergeSource.class); + OneMerge oneMerge = mock(OneMerge.class); + when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomLongBetween(1L, 10L))); + when(oneMerge.getMergeProgress()).thenReturn(new MergePolicy.OneMergeProgress()); + when(mergeSource.getNextMerge()).thenReturn(oneMerge, (OneMerge) null); + threadPoolMergeScheduler.merge(mergeSource, randomFrom(MergeTrigger.values())); + // assert index throttling because IO throttling is at max value + assertThat(threadPoolMergeScheduler.isIndexingThrottlingEnabled(), is(true)); + } + + public void testIndexingThrottlingWhileMergesAreRunning() { + final int maxThreadCount = randomIntBetween(1, 5); + // settings validation requires maxMergeCount >= maxThreadCount + final int maxMergeCount = maxThreadCount + randomIntBetween(0, 5); + List submittedMergeTasks = new ArrayList<>(); + List scheduledToRunMergeTasks = new ArrayList<>(); + 
AtomicBoolean isUsingMaxTargetIORate = new AtomicBoolean(false); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = mockThreadPoolMergeExecutorService( + submittedMergeTasks, + isUsingMaxTargetIORate + ); + Settings mergeSchedulerSettings = Settings.builder() + .put(MergeSchedulerConfig.MAX_THREAD_COUNT_SETTING.getKey(), maxThreadCount) + .put(MergeSchedulerConfig.MAX_MERGE_COUNT_SETTING.getKey(), maxMergeCount) + .build(); + TestThreadPoolMergeScheduler threadPoolMergeScheduler = new TestThreadPoolMergeScheduler( + new ShardId("index", "_na_", 1), + IndexSettingsModule.newIndexSettings("index", mergeSchedulerSettings), + threadPoolMergeExecutorService + ); + int mergesToRun = randomIntBetween(0, 5); + // make sure there are more merges submitted and not run + int excessMerges = randomIntBetween(1, 10); + int mergesToSubmit = maxMergeCount + mergesToRun + excessMerges; + int mergesOutstanding = 0; + boolean expectIndexThrottling = false; + // merges are submitted, while some are also scheduled and run + while (mergesToSubmit > 0) { + isUsingMaxTargetIORate.set(randomBoolean()); + if (submittedMergeTasks.isEmpty() == false && randomBoolean()) { + // maybe schedule one submitted merge + MergeTask mergeTask = randomFrom(submittedMergeTasks); + submittedMergeTasks.remove(mergeTask); + Schedule schedule = mergeTask.schedule(); + if (schedule == Schedule.RUN) { + scheduledToRunMergeTasks.add(mergeTask); + } + } else { + if (mergesToRun > 0 && scheduledToRunMergeTasks.isEmpty() == false && randomBoolean()) { + // maybe run one scheduled merge + MergeTask mergeTask = randomFrom(scheduledToRunMergeTasks); + scheduledToRunMergeTasks.remove(mergeTask); + mergeTask.run(); + mergesToRun--; + mergesOutstanding--; + } else { + // submit one merge + MergeSource mergeSource = mock(MergeSource.class); + OneMerge oneMerge = mock(OneMerge.class); + when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomLongBetween(1L, 10L))); + 
when(oneMerge.getMergeProgress()).thenReturn(new MergePolicy.OneMergeProgress()); + when(mergeSource.getNextMerge()).thenReturn(oneMerge, (OneMerge) null); + threadPoolMergeScheduler.merge(mergeSource, randomFrom(MergeTrigger.values())); + mergesToSubmit--; + mergesOutstanding++; + } + if (isUsingMaxTargetIORate.get() && mergesOutstanding > maxMergeCount) { + expectIndexThrottling = true; + } else if (mergesOutstanding <= maxMergeCount) { + expectIndexThrottling = false; + } + } + // assert IO throttle state + assertThat(threadPoolMergeScheduler.isIndexingThrottlingEnabled(), is(expectIndexThrottling)); + } + // execute all remaining merges (submitted or scheduled) + while (mergesToRun > 0 || submittedMergeTasks.isEmpty() == false || scheduledToRunMergeTasks.isEmpty() == false) { + // simulate that the {@link ThreadPoolMergeExecutorService} maybe peaked IO un-throttling + isUsingMaxTargetIORate.set(randomBoolean()); + if (submittedMergeTasks.isEmpty() == false && (scheduledToRunMergeTasks.isEmpty() || randomBoolean())) { + // maybe schedule one submitted merge + MergeTask mergeTask = randomFrom(submittedMergeTasks); + submittedMergeTasks.remove(mergeTask); + Schedule schedule = mergeTask.schedule(); + if (schedule == Schedule.RUN) { + scheduledToRunMergeTasks.add(mergeTask); + } + } else { + // maybe run one scheduled merge + MergeTask mergeTask = randomFrom(scheduledToRunMergeTasks); + scheduledToRunMergeTasks.remove(mergeTask); + mergeTask.run(); + mergesToRun--; + mergesOutstanding--; + if (isUsingMaxTargetIORate.get() && mergesOutstanding > maxMergeCount) { + expectIndexThrottling = true; + } else if (mergesOutstanding <= maxMergeCount) { + expectIndexThrottling = false; + } + } + // assert IO throttle state + assertThat(threadPoolMergeScheduler.isIndexingThrottlingEnabled(), is(expectIndexThrottling)); + } + // all merges done + assertThat(threadPoolMergeScheduler.isIndexingThrottlingEnabled(), is(false)); + } + public void 
testMergeSourceWithFollowUpMergesRunSequentially() throws Exception { // test with min 2 allowed concurrent merges int mergeExecutorThreadCount = randomIntBetween(2, 5); @@ -493,4 +652,49 @@ private static MergeInfo getNewMergeInfo(long estimatedMergeBytes) { private static MergeInfo getNewMergeInfo(long estimatedMergeBytes, int maxNumSegments) { return new MergeInfo(randomNonNegativeInt(), estimatedMergeBytes, randomBoolean(), maxNumSegments); } + + static class TestThreadPoolMergeScheduler extends ThreadPoolMergeScheduler { + AtomicBoolean isIndexingThrottlingEnabled = new AtomicBoolean(false); + + TestThreadPoolMergeScheduler( + ShardId shardId, + IndexSettings indexSettings, + ThreadPoolMergeExecutorService threadPoolMergeExecutorService + ) { + super(shardId, indexSettings, threadPoolMergeExecutorService); + } + + @Override + protected void enableIndexingThrottling(int numRunningMerges, int numQueuedMerges, int configuredMaxMergeCount) { + isIndexingThrottlingEnabled.set(true); + } + + @Override + protected void disableIndexingThrottling(int numRunningMerges, int numQueuedMerges, int configuredMaxMergeCount) { + isIndexingThrottlingEnabled.set(false); + } + + boolean isIndexingThrottlingEnabled() { + return isIndexingThrottlingEnabled.get(); + } + } + + static ThreadPoolMergeExecutorService mockThreadPoolMergeExecutorService( + List submittedMergeTasks, + AtomicBoolean isUsingMaxTargetIORate + ) { + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = mock(ThreadPoolMergeExecutorService.class); + doAnswer(invocation -> { + MergeTask mergeTask = (MergeTask) invocation.getArguments()[0]; + submittedMergeTasks.add(mergeTask); + return null; + }).when(threadPoolMergeExecutorService).submitMergeTask(any(MergeTask.class)); + doAnswer(invocation -> { + MergeTask mergeTask = (MergeTask) invocation.getArguments()[0]; + submittedMergeTasks.add(mergeTask); + return null; + 
}).when(threadPoolMergeExecutorService).reEnqueueBackloggedMergeTask(any(MergeTask.class)); + doAnswer(invocation -> isUsingMaxTargetIORate.get()).when(threadPoolMergeExecutorService).usingMaxTargetIORateBytesPerSec(); + return threadPoolMergeExecutorService; + } } From 6eed2521ed5817497c130e8027ea7c88fa48e8bf Mon Sep 17 00:00:00 2001 From: Albert Zaharovits Date: Wed, 2 Apr 2025 12:36:49 +0300 Subject: [PATCH 03/14] Slack merge throttling params for fewer merge tasks (#126016) The intent here is to aim for fewer to-do merges enqueued for execution, and to unthrottle disk IO at a faster rate when the queue grows longer. Overall this results in less merge disk throttling. Relates https://github.com/elastic/elasticsearch-benchmarks/issues/2437 https://github.com/elastic/elasticsearch/pull/120869 --- .../ThreadPoolMergeExecutorService.java | 20 +++------- .../ThreadPoolMergeExecutorServiceTests.java | 37 +++++++++---------- 2 files changed, 23 insertions(+), 34 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorService.java b/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorService.java index 7c78698ac6f66..7e41fffdd5357 100644 --- a/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorService.java +++ b/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorService.java @@ -87,8 +87,10 @@ public class ThreadPoolMergeExecutorService { private ThreadPoolMergeExecutorService(ThreadPool threadPool) { this.executorService = threadPool.executor(ThreadPool.Names.MERGE); this.maxConcurrentMerges = threadPool.info(ThreadPool.Names.MERGE).getMax(); - this.concurrentMergesFloorLimitForThrottling = maxConcurrentMerges * 2; - this.concurrentMergesCeilLimitForThrottling = maxConcurrentMerges * 4; + // the intent here is to throttle down whenever we submit a task and no other task is running + this.concurrentMergesFloorLimitForThrottling = 2; + 
this.concurrentMergesCeilLimitForThrottling = maxConcurrentMerges * 2; + assert concurrentMergesFloorLimitForThrottling <= concurrentMergesCeilLimitForThrottling; } boolean submitMergeTask(MergeTask mergeTask) { @@ -230,10 +232,10 @@ private static long newTargetIORateBytesPerSec( ); } else if (currentlySubmittedIOThrottledMergeTasks > concurrentMergesCeilLimitForThrottling && currentTargetIORateBytesPerSec < MAX_IO_RATE.getBytes()) { - // increase target IO rate by 10% (capped) + // increase target IO rate by 20% (capped) newTargetIORateBytesPerSec = Math.min( MAX_IO_RATE.getBytes(), - currentTargetIORateBytesPerSec + currentTargetIORateBytesPerSec / 10L + currentTargetIORateBytesPerSec + currentTargetIORateBytesPerSec / 5L ); } else { newTargetIORateBytesPerSec = currentTargetIORateBytesPerSec; @@ -295,14 +297,4 @@ long getTargetIORateBytesPerSec() { int getMaxConcurrentMerges() { return maxConcurrentMerges; } - - // exposed for tests - int getConcurrentMergesFloorLimitForThrottling() { - return concurrentMergesFloorLimitForThrottling; - } - - // exposed for tests - int getConcurrentMergesCeilLimitForThrottling() { - return concurrentMergesCeilLimitForThrottling; - } } diff --git a/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorServiceTests.java b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorServiceTests.java index 0a99c5002d5ad..8ce1645148337 100644 --- a/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorServiceTests.java +++ b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorServiceTests.java @@ -219,18 +219,16 @@ public void testTargetIORateChangesWhenSubmittingMergeTasks() throws Exception { } long newIORate = threadPoolMergeExecutorService.getTargetIORateBytesPerSec(); if (supportsIOThrottling) { - if (submittedIOThrottledMergeTasks.get() < threadPoolMergeExecutorService - .getConcurrentMergesFloorLimitForThrottling()) { - // assert the IO rate 
decreases, with a floor limit, when there are few merge tasks enqueued + if (submittedIOThrottledMergeTasks.get() < 2) { + // assert the IO rate decreases, with a floor limit, when there is just a single merge task running assertThat(newIORate, either(is(MIN_IO_RATE.getBytes())).or(lessThan(currentIORate))); - } else if (submittedIOThrottledMergeTasks.get() > threadPoolMergeExecutorService - .getConcurrentMergesCeilLimitForThrottling()) { - // assert the IO rate increases, with a ceiling limit, when there are many merge tasks enqueued - assertThat(newIORate, either(is(MAX_IO_RATE.getBytes())).or(greaterThan(currentIORate))); - } else { - // assert the IO rate does NOT change when there are a couple of merge tasks enqueued - assertThat(newIORate, equalTo(currentIORate)); - } + } else if (submittedIOThrottledMergeTasks.get() > threadPoolMergeExecutorService.getMaxConcurrentMerges() * 2) { + // assert the IO rate increases, with a ceiling limit, when there are many merge tasks enqueued + assertThat(newIORate, either(is(MAX_IO_RATE.getBytes())).or(greaterThan(currentIORate))); + } else { + // assert the IO rate does NOT change when there are a couple of merge tasks enqueued + assertThat(newIORate, equalTo(currentIORate)); + } } else { // assert the IO rate does not change, when the merge task doesn't support IO throttling assertThat(newIORate, equalTo(currentIORate)); @@ -375,17 +373,16 @@ private void testIORateAdjustedForSubmittedTasks( initialTasksCounter--; threadPoolMergeExecutorService.submitMergeTask(mergeTask); long newTargetIORateLimit = threadPoolMergeExecutorService.getTargetIORateBytesPerSec(); - if (currentlySubmittedMergeTaskCount.get() < threadPoolMergeExecutorService.getConcurrentMergesFloorLimitForThrottling()) { + if (currentlySubmittedMergeTaskCount.get() < 2) { // assert the IO rate decreases, with a floor limit, when there are few merge tasks enqueued assertThat(newTargetIORateLimit, 
either(is(MIN_IO_RATE.getBytes())).or(lessThan(targetIORateLimit.get()))); - } else if (currentlySubmittedMergeTaskCount.get() > threadPoolMergeExecutorService - .getConcurrentMergesCeilLimitForThrottling()) { - // assert the IO rate increases, with a ceiling limit, when there are many merge tasks enqueued - assertThat(newTargetIORateLimit, either(is(MAX_IO_RATE.getBytes())).or(greaterThan(targetIORateLimit.get()))); - } else { - // assert the IO rate does change, when there are a couple of merge tasks enqueued - assertThat(newTargetIORateLimit, equalTo(targetIORateLimit.get())); - } + } else if (currentlySubmittedMergeTaskCount.get() > threadPoolMergeExecutorService.getMaxConcurrentMerges() * 2) { + // assert the IO rate increases, with a ceiling limit, when there are many merge tasks enqueued + assertThat(newTargetIORateLimit, either(is(MAX_IO_RATE.getBytes())).or(greaterThan(targetIORateLimit.get()))); + } else { + // assert the IO rate does not change, when there are a couple of merge tasks enqueued + assertThat(newTargetIORateLimit, equalTo(targetIORateLimit.get())); + } targetIORateLimit.set(newTargetIORateLimit); } else { // execute already submitted merge task From 5727a1fa0cd8b6f35d3551ae0ec85f07507a7eec Mon Sep 17 00:00:00 2001 From: Mark Vieira Date: Thu, 20 Mar 2025 13:03:44 -0700 Subject: [PATCH 04/14] Fix failure in ScalingThreadPoolTests after addition of merge scheduler (#125245) --- .../org/elasticsearch/threadpool/ScalingThreadPoolTests.java | 1 + 1 file changed, 1 insertion(+) diff --git a/server/src/test/java/org/elasticsearch/threadpool/ScalingThreadPoolTests.java b/server/src/test/java/org/elasticsearch/threadpool/ScalingThreadPoolTests.java index d0b4f90948a4e..aaffcb4085f12 100644 --- a/server/src/test/java/org/elasticsearch/threadpool/ScalingThreadPoolTests.java +++ b/server/src/test/java/org/elasticsearch/threadpool/ScalingThreadPoolTests.java @@ -119,6 +119,7 @@ private int expectedSize(final String threadPoolName, final int 
numberOfProcesso sizes.put(ThreadPool.Names.SNAPSHOT_META, n -> Math.min(n * 3, 50)); sizes.put(ThreadPool.Names.FETCH_SHARD_STARTED, ThreadPool::twiceAllocatedProcessors); sizes.put(ThreadPool.Names.FETCH_SHARD_STORE, ThreadPool::twiceAllocatedProcessors); + sizes.put(ThreadPool.Names.MERGE, Function.identity()); return sizes.get(threadPoolName).apply(numberOfProcessors); } From a255537a60504f691a1fd4f71b69d2cf75930c66 Mon Sep 17 00:00:00 2001 From: Albert Zaharovits Date: Wed, 2 Apr 2025 16:07:18 +0300 Subject: [PATCH 05/14] Fix ThreadPoolMergeSchedulerStressTestIT testMergingFallsBehindAndThenCatchesUp (#125956) We don't know how many semaphore merge permits we need to release, or how many are already released. Fixes #125744 --- .../ThreadPoolMergeSchedulerStressTestIT.java | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerStressTestIT.java b/server/src/internalClusterTest/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerStressTestIT.java index 1743ca1996055..f3a9e5db28047 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerStressTestIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerStressTestIT.java @@ -79,12 +79,6 @@ public static class TestEnginePlugin extends Plugin implements EnginePlugin { final Semaphore runMergeSemaphore = new Semaphore(initialRunMergesCount); final int waitMergesEnqueuedCount = randomIntBetween(50, 100); - void allowAllMerging() { - // even when indexing is done, queued and backlogged merges can themselves trigger further merging - // don't let this test be bothered by that, and simply let all merging run unhindered - runMergeSemaphore.release(Integer.MAX_VALUE - initialRunMergesCount); - } - class TestInternalEngine extends org.elasticsearch.index.engine.InternalEngine { 
TestInternalEngine(EngineConfig engineConfig) { @@ -265,10 +259,13 @@ public void testMergingFallsBehindAndThenCatchesUp() throws Exception { for (Thread indexingThread : indexingThreads) { indexingThread.join(); } - // unblock merge threads - testEnginePlugin.allowAllMerging(); + // even when indexing is done, queued and backlogged merges can themselves trigger further merging + // don't let this test be bothered by that, and simply unblock all merges + // 100k is a fudge value, but there's no easy way to find a smartest one here + testEnginePlugin.runMergeSemaphore.release(100_000); // await all merging to catch up assertBusy(() -> { + assert testEnginePlugin.runMergeSemaphore.availablePermits() > 0 : "some merges are blocked, test is broken"; assertThat(testEnginePlugin.runningMergesSet.size(), is(0)); assertThat(testEnginePlugin.enqueuedMergesSet.size(), is(0)); testEnginePlugin.mergeExecutorServiceReference.get().allDone(); From c43805ebcc66ecbeba21b5348407c084e18add85 Mon Sep 17 00:00:00 2001 From: Albert Zaharovits Date: Wed, 2 Apr 2025 17:13:46 +0300 Subject: [PATCH 06/14] Fix testMergeSourceWithFollowUpMergesRunSequentially (#126050) Fixes #125639 Relates #120869 --- .../index/engine/ThreadPoolMergeSchedulerTests.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java index ae9168357eb32..8919ec46ba176 100644 --- a/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java +++ b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java @@ -390,11 +390,13 @@ public void testMergeSourceWithFollowUpMergesRunSequentially() throws Exception }).when(mergeSource).merge(any(OneMerge.class)); // trigger run merges on the merge source threadPoolMergeScheduler.merge(mergeSource, randomFrom(MergeTrigger.values())); - 
do { + boolean done = false; + while (done == false) { // let merges run, but wait for the in-progress one to signal it is running nextMergeSemaphore.acquire(); + done = runMergeIdx.get() >= followUpMergeCount; runMergeSemaphore.release(); - } while (runMergeIdx.get() < followUpMergeCount); + } assertBusy(() -> assertTrue(threadPoolMergeExecutorService.allDone())); } } From db2ed34404818be7b87d2f1f054e237a6cd7b2d7 Mon Sep 17 00:00:00 2001 From: Albert Zaharovits Date: Thu, 3 Apr 2025 11:11:55 +0300 Subject: [PATCH 07/14] Fix ThreadPoolMergeSchedulerTests testSchedulerCloseWaitsForRunningMerge (#126110) Fixes #125236 --- .../index/engine/ThreadPoolMergeSchedulerTests.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java index 8919ec46ba176..d407e865efbaf 100644 --- a/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java +++ b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java @@ -508,18 +508,21 @@ public void testSchedulerCloseWaitsForRunningMerge() throws Exception { ) ) { CountDownLatch mergeDoneLatch = new CountDownLatch(1); + CountDownLatch mergeRunningLatch = new CountDownLatch(1); MergeSource mergeSource = mock(MergeSource.class); OneMerge oneMerge = mock(OneMerge.class); when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomLongBetween(1L, 10L))); when(oneMerge.getMergeProgress()).thenReturn(new MergePolicy.OneMergeProgress()); when(mergeSource.getNextMerge()).thenReturn(oneMerge, (OneMerge) null); doAnswer(invocation -> { + mergeRunningLatch.countDown(); OneMerge merge = (OneMerge) invocation.getArguments()[0]; assertFalse(merge.isAborted()); // wait to be signalled before completing the merge mergeDoneLatch.await(); return null; }).when(mergeSource).merge(any(OneMerge.class)); + // submit the 
merge threadPoolMergeScheduler.merge(mergeSource, randomFrom(MergeTrigger.values())); Thread t = new Thread(() -> { try { @@ -531,6 +534,8 @@ public void testSchedulerCloseWaitsForRunningMerge() throws Exception { t.start(); try { assertTrue(t.isAlive()); + // wait for the merge to actually run + mergeRunningLatch.await(); // ensure the merge scheduler is effectively "closed" assertBusy(() -> { MergeSource mergeSource2 = mock(MergeSource.class); From 646069db7c42cd8b2e2f85ba421b19a88c02e9e5 Mon Sep 17 00:00:00 2001 From: Albert Zaharovits Date: Thu, 3 Apr 2025 14:09:38 +0300 Subject: [PATCH 08/14] Fix IndexStatsIT (#126113) Ensures proper cleanup in the testThrottleStats test. Fixes #125910 #125907 #125912 --- .../indices/stats/IndexStatsIT.java | 52 ++++++++++--------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/indices/stats/IndexStatsIT.java b/server/src/internalClusterTest/java/org/elasticsearch/indices/stats/IndexStatsIT.java index afd9e98b3e7d9..06972d09d609c 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/indices/stats/IndexStatsIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/indices/stats/IndexStatsIT.java @@ -466,27 +466,26 @@ public void testNonThrottleStats() throws Exception { assertThat(stats.getPrimaries().getIndexing().getTotal().getThrottleTime().millis(), equalTo(0L)); } - public void testThrottleStats() { + public void testThrottleStats() throws Exception { assertAcked( - prepareCreate("test").setSettings( + prepareCreate("test_throttle_stats_index").setSettings( settingsBuilder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, "1") .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, "0") .put(MergePolicyConfig.INDEX_MERGE_POLICY_MAX_MERGE_AT_ONCE_SETTING.getKey(), "2") .put(MergePolicyConfig.INDEX_MERGE_POLICY_SEGMENTS_PER_TIER_SETTING.getKey(), "2") .put(MergeSchedulerConfig.MAX_THREAD_COUNT_SETTING.getKey(), "1") 
.put(MergeSchedulerConfig.MAX_MERGE_COUNT_SETTING.getKey(), "1") + .put(MergeSchedulerConfig.AUTO_THROTTLE_SETTING.getKey(), "true") .put(IndexSettings.INDEX_TRANSLOG_DURABILITY_SETTING.getKey(), Translog.Durability.ASYNC.name()) ) ); - ensureGreen(); + ensureGreen("test_throttle_stats_index"); // make sure we see throttling kicking in: AtomicBoolean done = new AtomicBoolean(); AtomicLong termUpTo = new AtomicLong(); - long start = System.currentTimeMillis(); - for (int threadIdx = 0; threadIdx < 5; threadIdx++) { - int finalThreadIdx = threadIdx; - new Thread(() -> { - IndicesStatsResponse stats; + Thread[] indexingThreads = new Thread[5]; + for (int threadIdx = 0; threadIdx < indexingThreads.length; threadIdx++) { + indexingThreads[threadIdx] = new Thread(() -> { while (done.get() == false) { for (int i = 0; i < 100; i++) { // Provoke slowish merging by making many unique terms: @@ -495,30 +494,35 @@ public void testThrottleStats() { sb.append(' '); sb.append(termUpTo.incrementAndGet()); } - prepareIndex("test").setId("" + termUpTo.get()).setSource("field" + (i % 10), sb.toString()).get(); + prepareIndex("test_throttle_stats_index").setId("" + termUpTo.get()) + .setSource("field" + (i % 10), sb.toString()) + .get(); if (i % 2 == 0) { - refresh(); + refresh("test_throttle_stats_index"); } } - refresh(); - if (finalThreadIdx == 0) { - stats = indicesAdmin().prepareStats().get(); - done.set(stats.getPrimaries().getIndexing().getTotal().getThrottleTime().millis() > 0); - } - if (System.currentTimeMillis() - start > 300 * 1000) { // Wait 5 minutes for throttling to kick in - done.set(true); - fail("index throttling didn't kick in after 5 minutes of intense merging"); - } + refresh("test_throttle_stats_index"); } - }).start(); + }); + indexingThreads[threadIdx].start(); + } + + assertBusy(() -> { + IndicesStatsResponse stats = indicesAdmin().prepareStats("test_throttle_stats_index").get(); + 
assertTrue(stats.getPrimaries().getIndexing().getTotal().getThrottleTime().millis() > 0); + done.set(true); + }, 5L, TimeUnit.MINUTES); + + for (Thread indexingThread : indexingThreads) { + indexingThread.join(); } // Optimize & flush and wait; else we sometimes get a "Delete Index failed - not acked" // when ESIntegTestCase.after tries to remove indices created by the test: - logger.info("test: now optimize"); - indicesAdmin().prepareForceMerge("test").get(); - flush(); - logger.info("test: test done"); + logger.info("test throttle stats: now optimize"); + indicesAdmin().prepareForceMerge("test_throttle_stats_index").get(); + flush("test_throttle_stats_index"); + logger.info("test throttle stats: test done"); } public void testSimpleStats() throws Exception { From 55a477ab66d684a5bb8d5abe21a33024c3c77172 Mon Sep 17 00:00:00 2001 From: Albert Zaharovits Date: Sun, 8 Jun 2025 17:26:10 +0300 Subject: [PATCH 09/14] Threadpool merge executor is aware of available disk space (#127613) This PR introduces 3 new settings: indices.merge.disk.check_interval, indices.merge.disk.watermark.high, and indices.merge.disk.watermark.high.max_headroom that control if the threadpool merge executor starts executing new merges when the disk space is getting low. The intent of this change is to avoid the situation where in-progress merges exhaust the available disk space on the node's local filesystem. To this end, the thread pool merge executor periodically monitors the available disk space, as well as the current disk space estimates required by all in-progress (currently running) merges on the node, and will NOT schedule any new merges if the disk space is getting low (by default below the 5% limit of the total disk space, or 100 GB, whichever is smaller (same as the disk allocation flood stage level)). 
--- docs/changelog/127613.yaml | 5 + .../common/settings/ClusterSettings.java | 4 + .../ThreadPoolMergeExecutorService.java | 558 ++++++++- .../engine/ThreadPoolMergeScheduler.java | 30 +- .../elasticsearch/indices/IndicesService.java | 13 +- .../elasticsearch/index/IndexModuleTests.java | 8 +- ...oolMergeExecutorServiceDiskSpaceTests.java | 1023 +++++++++++++++++ .../ThreadPoolMergeExecutorServiceTests.java | 292 ++++- .../engine/ThreadPoolMergeSchedulerTests.java | 43 +- .../index/shard/RefreshListenersTests.java | 13 +- .../index/engine/EngineTestCase.java | 9 +- .../index/shard/IndexShardTestCase.java | 10 +- .../index/engine/FollowingEngineTests.java | 13 +- 13 files changed, 1924 insertions(+), 97 deletions(-) create mode 100644 docs/changelog/127613.yaml create mode 100644 server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorServiceDiskSpaceTests.java diff --git a/docs/changelog/127613.yaml b/docs/changelog/127613.yaml new file mode 100644 index 0000000000000..de043e209b32e --- /dev/null +++ b/docs/changelog/127613.yaml @@ -0,0 +1,5 @@ +pr: 127613 +summary: Threadpool merge executor is aware of available disk space +area: Engine +type: feature +issues: [] diff --git a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java index dbc18468aa9bd..164012670743a 100644 --- a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java @@ -87,6 +87,7 @@ import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.IndexingPressure; import org.elasticsearch.index.MergePolicyConfig; +import org.elasticsearch.index.engine.ThreadPoolMergeExecutorService; import org.elasticsearch.index.engine.ThreadPoolMergeScheduler; import org.elasticsearch.indices.IndexingMemoryController; import org.elasticsearch.indices.IndicesQueryCache; @@ -621,6 
+622,9 @@ public void apply(Settings value, Settings current, Settings previous) { MergePolicyConfig.DEFAULT_MAX_MERGED_SEGMENT_SETTING, MergePolicyConfig.DEFAULT_MAX_TIME_BASED_MERGED_SEGMENT_SETTING, ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING, + ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING, + ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_HIGH_MAX_HEADROOM_SETTING, + ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING, TransportService.ENABLE_STACK_OVERFLOW_AVOIDANCE, DataStreamGlobalRetentionSettings.DATA_STREAMS_DEFAULT_RETENTION_SETTING, DataStreamGlobalRetentionSettings.DATA_STREAMS_MAX_RETENTION_SETTING, diff --git a/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorService.java b/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorService.java index 7e41fffdd5357..32abd2dd8ada2 100644 --- a/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorService.java +++ b/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorService.java @@ -9,27 +9,153 @@ package org.elasticsearch.index.engine; -import org.elasticsearch.common.settings.Settings; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.elasticsearch.common.settings.ClusterSettings; +import org.elasticsearch.common.settings.Setting; +import org.elasticsearch.common.settings.Setting.Property; import org.elasticsearch.common.unit.ByteSizeValue; +import org.elasticsearch.common.unit.RelativeByteSizeValue; import org.elasticsearch.common.util.concurrent.ConcurrentCollections; import org.elasticsearch.core.Nullable; +import org.elasticsearch.core.Releasable; +import org.elasticsearch.core.TimeValue; +import org.elasticsearch.env.NodeEnvironment; import org.elasticsearch.index.engine.ThreadPoolMergeScheduler.MergeTask; +import org.elasticsearch.monitor.fs.FsInfo; +import 
org.elasticsearch.threadpool.Scheduler; import org.elasticsearch.threadpool.ThreadPool; +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; import java.util.Comparator; +import java.util.IdentityHashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.PriorityQueue; import java.util.Set; import java.util.concurrent.ExecutorService; -import java.util.concurrent.PriorityBlockingQueue; import java.util.concurrent.RejectedExecutionException; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.locks.Condition; +import java.util.concurrent.locks.ReentrantLock; +import java.util.function.Consumer; import java.util.function.LongUnaryOperator; +import java.util.function.ToLongFunction; +import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING; +import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING; import static org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule.ABORT; import static org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule.BACKLOG; import static org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule.RUN; +import static org.elasticsearch.index.engine.ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING; +import static org.elasticsearch.monitor.fs.FsProbe.getFSInfo; -public class ThreadPoolMergeExecutorService { +public class ThreadPoolMergeExecutorService implements Closeable { + /** How frequently we check disk usage (default: 5 seconds). 
*/ + public static final Setting INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING = Setting.positiveTimeSetting( + "indices.merge.disk.check_interval", + TimeValue.timeValueSeconds(5), + Property.Dynamic, + Property.NodeScope + ); + /** + * The occupied disk space threshold beyond which NO new merges are started. + * Conservatively, the estimated temporary disk space required for the to-be-started merge is counted as occupied disk space. + * Defaults to the routing allocation flood stage limit value (beyond which shards are toggled read-only). + */ + public static final Setting INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING = new Setting<>( + "indices.merge.disk.watermark.high", + CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING, + (s) -> RelativeByteSizeValue.parseRelativeByteSizeValue(s, "indices.merge.disk.watermark.high"), + new Setting.Validator<>() { + @Override + public void validate(RelativeByteSizeValue value) {} + + @Override + public void validate(RelativeByteSizeValue value, Map, Object> settings, boolean isPresent) { + if (isPresent && settings.get(USE_THREAD_POOL_MERGE_SCHEDULER_SETTING).equals(Boolean.FALSE)) { + throw new IllegalArgumentException( + "indices merge watermark setting is only effective when [" + + USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey() + + "] is set to [true]" + ); + } + } + + @Override + public Iterator> settings() { + List> res = List.of(INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING, USE_THREAD_POOL_MERGE_SCHEDULER_SETTING); + return res.iterator(); + } + }, + Property.Dynamic, + Property.NodeScope + ); + /** + * The available disk space headroom below which NO new merges are started. + * Conservatively, the estimated temporary disk space required for the to-be-started merge is NOT counted as available disk space. 
+ * Defaults to the routing allocation flood stage headroom value (below which shards are toggled read-only), + * unless the merge occupied disk space threshold is specified, in which case the default headroom value here is unset. + */ + public static final Setting INDICES_MERGE_DISK_HIGH_MAX_HEADROOM_SETTING = new Setting<>( + "indices.merge.disk.watermark.high.max_headroom", + (settings) -> { + // if the user explicitly set a value for the occupied disk space threshold, disable the implicit headroom value + if (INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING.exists(settings)) { + return "-1"; + } else { + return CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.get(settings).toString(); + } + }, + (s) -> ByteSizeValue.parseBytesSizeValue(s, "indices.merge.disk.watermark.high.max_headroom"), + new Setting.Validator<>() { + @Override + public void validate(ByteSizeValue value) {} + + @Override + public void validate(final ByteSizeValue value, final Map, Object> settings, boolean isPresent) { + if (isPresent) { + if (value.equals(ByteSizeValue.MINUS_ONE)) { + throw new IllegalArgumentException( + "setting a headroom value to less than 0 is not supported, use [null] value to unset" + ); + } + if (settings.get(USE_THREAD_POOL_MERGE_SCHEDULER_SETTING).equals(Boolean.FALSE)) { + throw new IllegalArgumentException( + "indices merge max headroom setting is only effective when [" + + USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey() + + "] is set to [true]" + ); + } + } + final RelativeByteSizeValue highWatermark = (RelativeByteSizeValue) settings.get(INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING); + final ByteSizeValue highHeadroom = (ByteSizeValue) settings.get(INDICES_MERGE_DISK_HIGH_MAX_HEADROOM_SETTING); + if (highWatermark.isAbsolute() && highHeadroom.equals(ByteSizeValue.MINUS_ONE) == false) { + throw new IllegalArgumentException( + "indices merge max headroom setting is set, but indices merge disk watermark value is not a relative value [" + + 
highWatermark.getStringRep() + + "]" + ); + } + } + + @Override + public Iterator> settings() { + List> res = List.of( + INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING, + INDICES_MERGE_DISK_HIGH_MAX_HEADROOM_SETTING, + USE_THREAD_POOL_MERGE_SCHEDULER_SETTING + ); + return res.iterator(); + } + }, + Property.Dynamic, + Property.NodeScope + ); /** * Floor for IO write rate limit of individual merge tasks (we will never go any lower than this) */ @@ -50,11 +176,10 @@ public class ThreadPoolMergeExecutorService { /** * The merge tasks that are waiting execution. This does NOT include backlogged or currently executing merge tasks. * For instance, this can be empty while there are backlogged merge tasks awaiting re-enqueuing. + * The budget (estimation) for a merge task is the disk space (still) required for it to complete. As the merge progresses, + * its budget decreases (as the bytes already written have been incorporated into the filesystem stats about the used disk space). */ - private final PriorityBlockingQueue queuedMergeTasks = new PriorityBlockingQueue<>( - 64, - Comparator.comparingLong(MergeTask::estimatedMergeSize) - ); + private final MergeTaskPriorityBlockingQueue queuedMergeTasks = new MergeTaskPriorityBlockingQueue(); /** * The set of all merge tasks currently being executed by merge threads from the pool. * These are tracked notably in order to be able to update their disk IO throttle rate, after they have started, while executing. 
@@ -72,29 +197,43 @@ public class ThreadPoolMergeExecutorService { private final int maxConcurrentMerges; private final int concurrentMergesFloorLimitForThrottling; private final int concurrentMergesCeilLimitForThrottling; + private final AvailableDiskSpacePeriodicMonitor availableDiskSpacePeriodicMonitor; public static @Nullable ThreadPoolMergeExecutorService maybeCreateThreadPoolMergeExecutorService( ThreadPool threadPool, - Settings settings + ClusterSettings clusterSettings, + NodeEnvironment nodeEnvironment ) { - if (ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.get(settings)) { - return new ThreadPoolMergeExecutorService(threadPool); + if (clusterSettings.get(USE_THREAD_POOL_MERGE_SCHEDULER_SETTING)) { + return new ThreadPoolMergeExecutorService(threadPool, clusterSettings, nodeEnvironment); } else { + // register no-op setting update consumers so that setting validations work properly + // (some validations are bypassed if there are no update consumers registered), + // i.e. 
to reject watermark and max headroom updates if the thread pool merge scheduler is disabled + clusterSettings.addSettingsUpdateConsumer(INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING, (ignored) -> {}); + clusterSettings.addSettingsUpdateConsumer(INDICES_MERGE_DISK_HIGH_MAX_HEADROOM_SETTING, (ignored) -> {}); + clusterSettings.addSettingsUpdateConsumer(INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING, (ignored) -> {}); return null; } } - private ThreadPoolMergeExecutorService(ThreadPool threadPool) { + private ThreadPoolMergeExecutorService(ThreadPool threadPool, ClusterSettings clusterSettings, NodeEnvironment nodeEnvironment) { this.executorService = threadPool.executor(ThreadPool.Names.MERGE); this.maxConcurrentMerges = threadPool.info(ThreadPool.Names.MERGE).getMax(); // the intent here is to throttle down whenever we submit a task and no other task is running this.concurrentMergesFloorLimitForThrottling = 2; this.concurrentMergesCeilLimitForThrottling = maxConcurrentMerges * 2; assert concurrentMergesFloorLimitForThrottling <= concurrentMergesCeilLimitForThrottling; + this.availableDiskSpacePeriodicMonitor = startDiskSpaceMonitoring( + threadPool, + nodeEnvironment.dataPaths(), + clusterSettings, + (availableDiskSpaceByteSize) -> this.queuedMergeTasks.updateBudget(availableDiskSpaceByteSize.getBytes()) + ); } boolean submitMergeTask(MergeTask mergeTask) { - assert mergeTask.isRunning() == false; + assert mergeTask.hasStartedRunning() == false; // first enqueue the runnable that runs exactly one merge task (the smallest it can find) if (enqueueMergeTaskExecution() == false) { // if the thread pool cannot run the merge, just abort it @@ -133,11 +272,20 @@ boolean submitMergeTask(MergeTask mergeTask) { } void reEnqueueBackloggedMergeTask(MergeTask mergeTask) { - queuedMergeTasks.add(mergeTask); + assert mergeTask.hasStartedRunning() == false; + enqueueMergeTask(mergeTask); + } + + private void enqueueMergeTask(MergeTask mergeTask) { + // To ensure that for a given merge 
onMergeQueued is called before onMergeAborted or onMergeCompleted, we call onMergeQueued + // before adding the merge task to the queue. Adding to the queue should not fail. + mergeEventListeners.forEach(l -> l.onMergeQueued(mergeTask.getOnGoingMerge(), mergeTask.getMergeMemoryEstimateBytes())); + boolean added = queuedMergeTasks.enqueue(mergeTask); + assert added; } public boolean allDone() { - return queuedMergeTasks.isEmpty() && runningMergeTasks.isEmpty() && ioThrottledMergeTasksCount.get() == 0L; + return queuedMergeTasks.isQueueEmpty() && runningMergeTasks.isEmpty() && ioThrottledMergeTasksCount.get() == 0L; } /** @@ -150,10 +298,13 @@ private boolean enqueueMergeTaskExecution() { // one such runnable always executes a SINGLE merge task from the queue // this is important for merge queue statistics, i.e. the executor's queue size represents the current amount of merges while (true) { - MergeTask smallestMergeTask; + PriorityBlockingQueueWithBudget.ElementWithReleasableBudget smallestMergeTaskWithReleasableBudget; try { - // will block if there are backlogged merges until they're enqueued again - smallestMergeTask = queuedMergeTasks.take(); + // Will block if there are backlogged merges until they're enqueued again + // (for e.g. if the per-shard concurrent merges count limit is reached). + // Will also block if there is insufficient budget (i.e. estimated available disk space + // for the smallest merge task to run to completion) + smallestMergeTaskWithReleasableBudget = queuedMergeTasks.take(); } catch (InterruptedException e) { // An active worker thread has been interrupted while waiting for backlogged merges to be re-enqueued. // In this case, we terminate the worker thread promptly and forget about the backlogged merges. @@ -163,18 +314,24 @@ private boolean enqueueMergeTaskExecution() { // is also drained, so any queued merge tasks are also forgotten. 
break; } - // let the task's scheduler decide if it can actually run the merge task now - ThreadPoolMergeScheduler.Schedule schedule = smallestMergeTask.schedule(); - if (schedule == RUN) { - runMergeTask(smallestMergeTask); - break; - } else if (schedule == ABORT) { - abortMergeTask(smallestMergeTask); - break; - } else { - assert schedule == BACKLOG; - // the merge task is backlogged by the merge scheduler, try to get the next smallest one - // it's then the duty of the said merge scheduler to re-enqueue the backlogged merge task when it can be run + try (var ignored = smallestMergeTaskWithReleasableBudget) { + MergeTask smallestMergeTask = smallestMergeTaskWithReleasableBudget.element(); + // let the task's scheduler decide if it can actually run the merge task now + ThreadPoolMergeScheduler.Schedule schedule = smallestMergeTask.schedule(); + if (schedule == RUN) { + runMergeTask(smallestMergeTask); + break; + } else if (schedule == ABORT) { + abortMergeTask(smallestMergeTask); + break; + } else { + assert schedule == BACKLOG; + // The merge task is backlogged by the merge scheduler, try to get the next smallest one. + // It's then the duty of the said merge scheduler to re-enqueue the backlogged merge task when + // itself decides that the merge task could be run. Note that it is possible that this merge + // task is re-enqueued and re-took before the budget hold-up here is released upon the next + // {@link PriorityBlockingQueueWithBudget#updateBudget} invocation. 
+ } } } }); @@ -187,7 +344,7 @@ private boolean enqueueMergeTaskExecution() { } private void runMergeTask(MergeTask mergeTask) { - assert mergeTask.isRunning() == false; + assert mergeTask.hasStartedRunning() == false; boolean added = runningMergeTasks.add(mergeTask); assert added : "starting merge task [" + mergeTask + "] registered as already running"; try { @@ -205,7 +362,7 @@ private void runMergeTask(MergeTask mergeTask) { } private void abortMergeTask(MergeTask mergeTask) { - assert mergeTask.isRunning() == false; + assert mergeTask.hasStartedRunning() == false; assert runningMergeTasks.contains(mergeTask) == false; try { mergeTask.abort(); @@ -216,6 +373,331 @@ private void abortMergeTask(MergeTask mergeTask) { } } + /** + * Start monitoring the available disk space, and update the available budget for running merge tasks + * Note: this doesn't work correctly for nodes with multiple data paths, as it only considers the data path with the MOST + * available disk space. In this case, merges will NOT be blocked for shards on data paths with insufficient available + * disk space, as long as a single data path has enough available disk space to run merges for any shards that it stores + * (i.e. 
multiple data path is not really supported when blocking merges due to insufficient available disk space + * (but nothing blows up either, if using multiple data paths)) + */ + static AvailableDiskSpacePeriodicMonitor startDiskSpaceMonitoring( + ThreadPool threadPool, + NodeEnvironment.DataPath[] dataPaths, + ClusterSettings clusterSettings, + Consumer availableDiskSpaceUpdateConsumer + ) { + AvailableDiskSpacePeriodicMonitor availableDiskSpacePeriodicMonitor = new AvailableDiskSpacePeriodicMonitor( + dataPaths, + threadPool, + clusterSettings.get(INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING), + clusterSettings.get(INDICES_MERGE_DISK_HIGH_MAX_HEADROOM_SETTING), + clusterSettings.get(INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING), + availableDiskSpaceByteSize -> { + if (availableDiskSpaceByteSize.equals(ByteSizeValue.MINUS_ONE)) { + // The merge executor is currently unaware of the available disk space because of an error. + // Merges are NOT blocked if the available disk space is insufficient. 
+ availableDiskSpaceUpdateConsumer.accept(ByteSizeValue.ofBytes(Long.MAX_VALUE)); + } else { + availableDiskSpaceUpdateConsumer.accept(availableDiskSpaceByteSize); + } + } + ); + if (availableDiskSpacePeriodicMonitor.isScheduled() == false) { + // in case the disk space monitor starts off as disabled, then make sure that merging is NOT blocked + // (in the other case, merging IS blocked until the first update for the available disk space) + availableDiskSpaceUpdateConsumer.accept(ByteSizeValue.ofBytes(Long.MAX_VALUE)); + } + clusterSettings.addSettingsUpdateConsumer( + INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING, + availableDiskSpacePeriodicMonitor::setHighStageWatermark + ); + clusterSettings.addSettingsUpdateConsumer( + INDICES_MERGE_DISK_HIGH_MAX_HEADROOM_SETTING, + availableDiskSpacePeriodicMonitor::setHighStageMaxHeadroom + ); + clusterSettings.addSettingsUpdateConsumer( + INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING, + availableDiskSpacePeriodicMonitor::setCheckInterval + ); + return availableDiskSpacePeriodicMonitor; + } + + static class AvailableDiskSpacePeriodicMonitor implements Closeable { + private static final Logger LOGGER = LogManager.getLogger(AvailableDiskSpacePeriodicMonitor.class); + private final NodeEnvironment.DataPath[] dataPaths; + private final ThreadPool threadPool; + private volatile RelativeByteSizeValue highStageWatermark; + private volatile ByteSizeValue highStageMaxHeadroom; + private volatile TimeValue checkInterval; + private final Consumer updateConsumer; + private volatile boolean closed; + private volatile Scheduler.Cancellable monitor; + + AvailableDiskSpacePeriodicMonitor( + NodeEnvironment.DataPath[] dataPaths, + ThreadPool threadPool, + RelativeByteSizeValue highStageWatermark, + ByteSizeValue highStageMaxHeadroom, + TimeValue checkInterval, + Consumer updateConsumer + ) { + this.dataPaths = dataPaths; + this.threadPool = threadPool; + this.highStageWatermark = highStageWatermark; + this.highStageMaxHeadroom = 
highStageMaxHeadroom; + this.checkInterval = checkInterval; + this.updateConsumer = updateConsumer; + this.closed = false; + reschedule(); + } + + void setCheckInterval(TimeValue checkInterval) { + this.checkInterval = checkInterval; + reschedule(); + } + + void setHighStageWatermark(RelativeByteSizeValue highStageWatermark) { + this.highStageWatermark = highStageWatermark; + } + + void setHighStageMaxHeadroom(ByteSizeValue highStageMaxHeadroom) { + this.highStageMaxHeadroom = highStageMaxHeadroom; + } + + private synchronized void reschedule() { + if (monitor != null) { + monitor.cancel(); + } + if (closed == false && checkInterval.duration() > 0) { + // do an eager run, + // in order to increase responsiveness in case the period is long and something blocks waiting for the first update + threadPool.generic().execute(this::run); + monitor = threadPool.scheduleWithFixedDelay(this::run, checkInterval, threadPool.generic()); + } else { + monitor = null; + } + } + + boolean isScheduled() { + return monitor != null && closed == false; + } + + @Override + public void close() throws IOException { + closed = true; + reschedule(); + } + + private void run() { + if (closed) { + return; + } + FsInfo.Path mostAvailablePath = null; + IOException fsInfoException = null; + for (NodeEnvironment.DataPath dataPath : dataPaths) { + try { + FsInfo.Path fsInfo = getFSInfo(dataPath); // uncached + if (mostAvailablePath == null || mostAvailablePath.getAvailable().getBytes() < fsInfo.getAvailable().getBytes()) { + mostAvailablePath = fsInfo; + } + } catch (IOException e) { + if (fsInfoException == null) { + fsInfoException = e; + } else { + fsInfoException.addSuppressed(e); + } + } + } + if (fsInfoException != null) { + LOGGER.warn("unexpected exception reading filesystem info", fsInfoException); + } + if (mostAvailablePath == null) { + LOGGER.error("Cannot read filesystem info for node data paths " + Arrays.toString(dataPaths)); + updateConsumer.accept(ByteSizeValue.MINUS_ONE); + 
return; + } + long mostAvailableDiskSpaceBytes = mostAvailablePath.getAvailable().getBytes(); + // subtract the configured free disk space threshold + mostAvailableDiskSpaceBytes -= getFreeBytesThreshold(mostAvailablePath.getTotal(), highStageWatermark, highStageMaxHeadroom) + .getBytes(); + // clamp available space to 0 + long maxMergeSizeLimit = Math.max(0L, mostAvailableDiskSpaceBytes); + updateConsumer.accept(ByteSizeValue.ofBytes(maxMergeSizeLimit)); + } + + private static ByteSizeValue getFreeBytesThreshold( + ByteSizeValue total, + RelativeByteSizeValue watermark, + ByteSizeValue maxHeadroom + ) { + // If bytes are given, they can be readily returned as free bytes. + // If percentages are given, we need to calculate the free bytes. + if (watermark.isAbsolute()) { + return watermark.getAbsolute(); + } + return ByteSizeValue.subtract(total, watermark.calculateValue(total, maxHeadroom)); + } + } + + static class MergeTaskPriorityBlockingQueue extends PriorityBlockingQueueWithBudget { + MergeTaskPriorityBlockingQueue() { + // start with 0 budget (so takes on this queue will always block until {@link #updateBudget} is invoked) + // use the estimated *remaining* merge size as the budget function so that the disk space budget of taken (in-use) elements is + // updated according to the remaining disk space requirements of the currently running merge tasks + super(MergeTask::estimatedRemainingMergeSize, 0L); + } + + // exposed for tests + long getAvailableBudget() { + return super.availableBudget; + } + + // exposed for tests + MergeTask peekQueue() { + return enqueuedByBudget.peek(); + } + } + + /** + * Similar to a regular priority queue, but the {@link #take()} operation will also block if the smallest element + * (according to the specified "budget" function) is larger than an updatable limit budget. 
+ */ + static class PriorityBlockingQueueWithBudget { + private final ToLongFunction budgetFunction; + protected final PriorityQueue enqueuedByBudget; + private final IdentityHashMap unreleasedBudgetPerElement; + private final ReentrantLock lock; + private final Condition elementAvailable; + protected long availableBudget; + + PriorityBlockingQueueWithBudget(ToLongFunction budgetFunction, long initialAvailableBudget) { + this.budgetFunction = budgetFunction; + this.enqueuedByBudget = new PriorityQueue<>(64, Comparator.comparingLong(budgetFunction)); + this.unreleasedBudgetPerElement = new IdentityHashMap<>(); + this.lock = new ReentrantLock(); + this.elementAvailable = lock.newCondition(); + this.availableBudget = initialAvailableBudget; + } + + boolean enqueue(E e) { + final ReentrantLock lock = this.lock; + lock.lock(); + try { + enqueuedByBudget.offer(e); + elementAvailable.signal(); + } finally { + lock.unlock(); + } + return true; + } + + /** + * Dequeues the smallest element (according to the specified "budget" function) if its budget is below the available limit. + * This method invocation blocks if the queue is empty or the element's budget is above the available limit. + */ + ElementWithReleasableBudget take() throws InterruptedException { + final ReentrantLock lock = this.lock; + lock.lockInterruptibly(); + try { + E peek; + long peekBudget; + // blocks until the smallest budget element fits the currently available budget + while ((peek = enqueuedByBudget.peek()) == null || (peekBudget = budgetFunction.applyAsLong(peek)) > availableBudget) { + elementAvailable.await(); + } + // deducts and holds up that element's budget from the available budget + return newElementWithReleasableBudget(enqueuedByBudget.poll(), peekBudget); + } finally { + lock.unlock(); + } + } + + /** + * Updates the available budget given the passed-in argument, from which it deducts the budget held up by taken elements + * that are still in use.
The budget of in-use elements is also updated (by re-applying the budget function). + * The newly updated budget is used to potentially block {@link #take()} operations if the smallest-budget enqueued element + * is over this newly computed available budget. + */ + void updateBudget(long availableBudget) { + final ReentrantLock lock = this.lock; + lock.lock(); + try { + this.availableBudget = availableBudget; + // update the per-element budget (these are all the elements that are using any budget) + unreleasedBudgetPerElement.replaceAll((e, v) -> budgetFunction.applyAsLong(e.element())); + // available budget is decreased by the used per-element budget (for all dequeued elements that are still in use) + this.availableBudget -= unreleasedBudgetPerElement.values().stream().mapToLong(i -> i).sum(); + elementAvailable.signalAll(); + } finally { + lock.unlock(); + } + } + + boolean isQueueEmpty() { + return enqueuedByBudget.isEmpty(); + } + + int queueSize() { + return enqueuedByBudget.size(); + } + + private ElementWithReleasableBudget newElementWithReleasableBudget(E element, long budget) { + ElementWithReleasableBudget elementWithReleasableBudget = new ElementWithReleasableBudget(element); + assert this.lock.isHeldByCurrentThread(); + // the taken element holds up some budget + var prev = this.unreleasedBudgetPerElement.put(elementWithReleasableBudget, budget); + assert prev == null; + this.availableBudget -= budget; + assert this.availableBudget >= 0L; + return elementWithReleasableBudget; + } + + private void release(ElementWithReleasableBudget elementWithReleasableBudget) { + final ReentrantLock lock = this.lock; + lock.lock(); + try { + assert elementWithReleasableBudget.isClosed() == false; + // when the taken element is not used anymore, it will not influence subsequent computations for available budget, + // but its allotted budget is not yet released + var val = unreleasedBudgetPerElement.remove(elementWithReleasableBudget); + assert val != null; + } finally 
{ + lock.unlock(); + } + } + + private boolean isReleased(ElementWithReleasableBudget elementWithReleasableBudget) { + return unreleasedBudgetPerElement.containsKey(elementWithReleasableBudget) == false; + } + + class ElementWithReleasableBudget implements Releasable { + private final E element; + + private ElementWithReleasableBudget(E element) { + this.element = element; + } + + /** + * Must be invoked when the caller is done with the element that it previously took from the queue. + * The budget it's holding is not immediately released, but the next time {@link #updateBudget(long)} + * is invoked this element's budget won't deduct from the total available. + */ + @Override + public void close() { + PriorityBlockingQueueWithBudget.this.release(this); + } + + boolean isClosed() { + return PriorityBlockingQueueWithBudget.this.isReleased(this); + } + + E element() { + return element; + } + } + } + private static long newTargetIORateBytesPerSec( long currentTargetIORateBytesPerSec, int currentlySubmittedIOThrottledMergeTasks, @@ -284,8 +766,13 @@ Set getRunningMergeTasks() { } // exposed for tests - PriorityBlockingQueue getQueuedMergeTasks() { - return queuedMergeTasks; + int getMergeTasksQueueLength() { + return queuedMergeTasks.queueSize(); + } + + // exposed for tests + long getDiskSpaceAvailableForNewMergeTasks() { + return queuedMergeTasks.getAvailableBudget(); } // exposed for tests and stats @@ -297,4 +784,9 @@ long getTargetIORateBytesPerSec() { int getMaxConcurrentMerges() { return maxConcurrentMerges; } + + @Override + public void close() throws IOException { + availableDiskSpacePeriodicMonitor.close(); + } } diff --git a/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeScheduler.java b/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeScheduler.java index f645edaff64a8..09efe3e4bf64e 100644 --- a/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeScheduler.java +++ 
b/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeScheduler.java @@ -55,7 +55,7 @@ public class ThreadPoolMergeScheduler extends MergeScheduler implements Elastics private final ThreadPoolMergeExecutorService threadPoolMergeExecutorService; private final PriorityQueue backloggedMergeTasks = new PriorityQueue<>( 16, - Comparator.comparingLong(MergeTask::estimatedMergeSize) + Comparator.comparingLong(MergeTask::estimatedRemainingMergeSize) ); private final Map runningMergeTasks = new HashMap<>(); // set when incoming merges should be throttled (i.e. restrict the indexing rate) @@ -214,7 +214,7 @@ private void checkMergeTaskThrottling() { // exposed for tests // synchronized so that {@code #closed}, {@code #runningMergeTasks} and {@code #backloggedMergeTasks} are modified atomically synchronized Schedule schedule(MergeTask mergeTask) { - assert mergeTask.isRunning() == false; + assert mergeTask.hasStartedRunning() == false; if (closed) { // do not run or backlog tasks when closing the merge scheduler, instead abort them return Schedule.ABORT; @@ -223,6 +223,7 @@ synchronized Schedule schedule(MergeTask mergeTask) { assert added : "starting merge task [" + mergeTask + "] registered as already running"; return Schedule.RUN; } else { + assert mergeTask.hasStartedRunning() == false; backloggedMergeTasks.add(mergeTask); return Schedule.BACKLOG; } @@ -337,8 +338,14 @@ public void setIORateLimit(long ioRateLimitBytesPerSec) { this.rateLimiter.setMBPerSec(ByteSizeValue.ofBytes(ioRateLimitBytesPerSec).getMbFrac()); } - public boolean isRunning() { - return mergeStartTimeNS.get() > 0L; + /** + * Returns {@code true} if this task is currently running, or was run in the past. + * An aborted task (see {@link #abort()}) is considered as NOT run. 
+ */ + public boolean hasStartedRunning() { + boolean isRunning = mergeStartTimeNS.get() > 0L; + assert isRunning != false || rateLimiter.getTotalBytesWritten() == 0L; + return isRunning; } /** @@ -349,7 +356,7 @@ public boolean isRunning() { */ @Override public void run() { - assert isRunning() == false; + assert hasStartedRunning() == false; assert ThreadPoolMergeScheduler.this.runningMergeTasks.containsKey(onGoingMerge.getMerge()) : "runNowOrBacklog must be invoked before actually running the merge task"; try { @@ -414,7 +421,7 @@ public void run() { * (by the {@link org.apache.lucene.index.IndexWriter}) to any subsequent merges. */ void abort() { - assert isRunning() == false; + assert hasStartedRunning() == false; assert ThreadPoolMergeScheduler.this.runningMergeTasks.containsKey(onGoingMerge.getMerge()) == false : "cannot abort a merge task that's already running"; if (verbose()) { @@ -443,10 +450,17 @@ void abort() { } } - long estimatedMergeSize() { + /** + * Before the merge task started running, this returns the estimated required disk space for the merge to complete + * (i.e. the estimated disk space size of the resulting segment following the merge). + * While the merge is running, the returned estimation is updated to take into account the data that's already been written. + * After the merge completes, the estimation returned here should ideally be close to "0". + */ + long estimatedRemainingMergeSize() { // TODO is it possible that `estimatedMergeBytes` be `0` for correctly initialize merges, // or is it always the case that if `estimatedMergeBytes` is `0` that means that the merge has not yet been initialized? 
- return onGoingMerge.getMerge().getStoreMergeInfo().estimatedMergeBytes(); + long estimatedMergeSize = onGoingMerge.getMerge().getStoreMergeInfo().estimatedMergeBytes(); + return Math.max(0L, estimatedMergeSize - rateLimiter.getTotalBytesWritten()); } @Override diff --git a/server/src/main/java/org/elasticsearch/indices/IndicesService.java b/server/src/main/java/org/elasticsearch/indices/IndicesService.java index 64fa709512bdf..c5711341a8a56 100644 --- a/server/src/main/java/org/elasticsearch/indices/IndicesService.java +++ b/server/src/main/java/org/elasticsearch/indices/IndicesService.java @@ -289,10 +289,6 @@ protected void doStart() { IndicesService(IndicesServiceBuilder builder) { this.settings = builder.settings; this.threadPool = builder.threadPool; - this.threadPoolMergeExecutorService = ThreadPoolMergeExecutorService.maybeCreateThreadPoolMergeExecutorService( - threadPool, - settings - ); this.pluginsService = builder.pluginsService; this.nodeEnv = builder.nodeEnv; this.parserConfig = XContentParserConfiguration.EMPTY.withDeprecationHandler(LoggingDeprecationHandler.INSTANCE) @@ -315,6 +311,12 @@ protected void doStart() { this.bigArrays = builder.bigArrays; this.scriptService = builder.scriptService; this.clusterService = builder.clusterService; + this.threadPoolMergeExecutorService = ThreadPoolMergeExecutorService.maybeCreateThreadPoolMergeExecutorService( + threadPool, + clusterService.getClusterSettings(), + nodeEnv + ); + this.projectResolver = builder.projectResolver; this.client = builder.client; this.featureService = builder.featureService; this.idFieldDataEnabled = INDICES_ID_FIELD_DATA_ENABLED_SETTING.get(clusterService.getSettings()); @@ -362,7 +364,8 @@ public void onRemoval(ShardId shardId, String fieldName, boolean wasEvicted, lon indicesFieldDataCache, cacheCleaner, indicesRequestCache, - indicesQueryCache + indicesQueryCache, + threadPoolMergeExecutorService ); } catch (IOException e) { throw new UncheckedIOException(e); diff --git 
a/server/src/test/java/org/elasticsearch/index/IndexModuleTests.java b/server/src/test/java/org/elasticsearch/index/IndexModuleTests.java index cf1b05bc29630..62f95ce18bccb 100644 --- a/server/src/test/java/org/elasticsearch/index/IndexModuleTests.java +++ b/server/src/test/java/org/elasticsearch/index/IndexModuleTests.java @@ -192,13 +192,17 @@ public void setUp() throws Exception { emptyMap() ); threadPool = new TestThreadPool("test"); - threadPoolMergeExecutorService = ThreadPoolMergeExecutorService.maybeCreateThreadPoolMergeExecutorService(threadPool, settings); circuitBreakerService = new NoneCircuitBreakerService(); PageCacheRecycler pageCacheRecycler = new PageCacheRecycler(settings); bigArrays = new BigArrays(pageCacheRecycler, circuitBreakerService, CircuitBreaker.REQUEST); scriptService = new ScriptService(settings, Collections.emptyMap(), Collections.emptyMap(), () -> 1L); - clusterService = ClusterServiceUtils.createClusterService(threadPool); + clusterService = ClusterServiceUtils.createClusterService(threadPool, ClusterSettings.createBuiltInClusterSettings(settings)); nodeEnvironment = new NodeEnvironment(settings, environment); + threadPoolMergeExecutorService = ThreadPoolMergeExecutorService.maybeCreateThreadPoolMergeExecutorService( + threadPool, + clusterService.getClusterSettings(), + nodeEnvironment + ); mapperRegistry = new IndicesModule(Collections.emptyList()).getMapperRegistry(); indexNameExpressionResolver = TestIndexNameExpressionResolver.newInstance(threadPool.getThreadContext()); } diff --git a/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorServiceDiskSpaceTests.java b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorServiceDiskSpaceTests.java new file mode 100644 index 0000000000000..97943101758fe --- /dev/null +++ b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorServiceDiskSpaceTests.java @@ -0,0 +1,1023 @@ +/* + * Copyright Elasticsearch B.V. 
and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.engine; + +import org.apache.lucene.tests.mockfile.FilterFileSystemProvider; +import org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings; +import org.elasticsearch.common.settings.ClusterSettings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.unit.ByteSizeValue; +import org.elasticsearch.common.util.concurrent.EsExecutors; +import org.elasticsearch.core.PathUtils; +import org.elasticsearch.core.PathUtilsForTesting; +import org.elasticsearch.core.TimeValue; +import org.elasticsearch.core.Tuple; +import org.elasticsearch.env.Environment; +import org.elasticsearch.env.NodeEnvironment; +import org.elasticsearch.env.TestEnvironment; +import org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.threadpool.TestThreadPool; +import org.elasticsearch.threadpool.ThreadPool; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.BeforeClass; + +import java.io.IOException; +import java.nio.file.FileStore; +import java.nio.file.FileSystem; +import java.nio.file.Path; +import java.nio.file.attribute.FileAttributeView; +import java.nio.file.attribute.FileStoreAttributeView; +import java.nio.file.spi.FileSystemProvider; +import java.util.ArrayList; +import java.util.IdentityHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.Executor; +import java.util.concurrent.TimeUnit; +import 
java.util.concurrent.atomic.AtomicLong; + +import static org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule.ABORT; +import static org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule.BACKLOG; +import static org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule.RUN; +import static org.hamcrest.Matchers.greaterThanOrEqualTo; +import static org.hamcrest.Matchers.is; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +public class ThreadPoolMergeExecutorServiceDiskSpaceTests extends ESTestCase { + + private static TestMockFileStore aFileStore = new TestMockFileStore("mocka"); + private static TestMockFileStore bFileStore = new TestMockFileStore("mockb"); + private static String aPathPart; + private static String bPathPart; + private static int mergeExecutorThreadCount; + private static Settings settings; + private static TestCapturingThreadPool testThreadPool; + private static NodeEnvironment nodeEnvironment; + + @BeforeClass + public static void installMockUsableSpaceFS() throws Exception { + FileSystem current = PathUtils.getDefaultFileSystem(); + aPathPart = "a-" + randomUUID(); + bPathPart = "b-" + randomUUID(); + FileSystemProvider mock = new TestMockUsableSpaceFileSystemProvider(current); + PathUtilsForTesting.installMock(mock.getFileSystem(null)); + Path path = PathUtils.get(createTempDir().toString()); + // use 2 data paths + String[] paths = new String[] { path.resolve(aPathPart).toString(), path.resolve(bPathPart).toString() }; + // some tests hold one merge thread blocked, and need at least one other runnable + mergeExecutorThreadCount = randomIntBetween(2, 9); + Settings.Builder settingsBuilder = Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), path) + .putList(Environment.PATH_DATA_SETTING.getKey(), paths) + // the default of "5s" slows down 
testing + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "50ms") + .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount); + if (randomBoolean()) { + settingsBuilder.put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), true); + } + settings = settingsBuilder.build(); + testThreadPool = new TestCapturingThreadPool("test", settings); + nodeEnvironment = new NodeEnvironment(settings, TestEnvironment.newEnvironment(settings)); + } + + @AfterClass + public static void removeMockUsableSpaceFS() { + PathUtilsForTesting.teardown(); + aFileStore = null; + bFileStore = null; + testThreadPool.close(); + nodeEnvironment.close(); + } + + @After + public void cleanupThreadPool() { + testThreadPool.scheduledTasks.clear(); + } + + static class TestCapturingThreadPool extends TestThreadPool { + final List> scheduledTasks = new ArrayList<>(); + + TestCapturingThreadPool(String name, Settings settings) { + super(name, settings); + } + + @Override + public Cancellable scheduleWithFixedDelay(Runnable command, TimeValue interval, Executor executor) { + Cancellable cancellable = super.scheduleWithFixedDelay(command, interval, executor); + scheduledTasks.add(new Tuple<>(interval, cancellable)); + return cancellable; + } + } + + static class TestMockUsableSpaceFileSystemProvider extends FilterFileSystemProvider { + + TestMockUsableSpaceFileSystemProvider(FileSystem inner) { + super("mockusablespace://", inner); + } + + @Override + public FileStore getFileStore(Path path) { + if (path.toString().contains(path.getFileSystem().getSeparator() + aPathPart)) { + return aFileStore; + } else { + assert path.toString().contains(path.getFileSystem().getSeparator() + bPathPart); + return bFileStore; + } + } + } + + static class TestMockFileStore extends FileStore { + + public volatile long totalSpace; + public volatile long freeSpace; + public volatile long usableSpace; + public volatile boolean throwIoException; 
+ + private final String desc; + + TestMockFileStore(String desc) { + this.desc = desc; + } + + @Override + public String type() { + return "mock"; + } + + @Override + public String name() { + return desc; + } + + @Override + public String toString() { + return desc; + } + + @Override + public boolean isReadOnly() { + return false; + } + + @Override + public long getTotalSpace() throws IOException { + if (throwIoException) { + throw new IOException("Test IO Exception"); + } + return totalSpace; + } + + @Override + public long getUnallocatedSpace() throws IOException { + if (throwIoException) { + throw new IOException("Test IO Exception"); + } + return freeSpace; + } + + @Override + public long getUsableSpace() throws IOException { + if (throwIoException) { + throw new IOException("Test IO Exception"); + } + return usableSpace; + } + + @Override + public boolean supportsFileAttributeView(Class type) { + return false; + } + + @Override + public boolean supportsFileAttributeView(String name) { + return false; + } + + @Override + public V getFileStoreAttributeView(Class type) { + return null; + } + + @Override + public Object getAttribute(String attribute) { + return null; + } + } + + public void testAvailableDiskSpaceMonitorWithDefaultSettings() throws Exception { + // path "a" has lots of free space, and "b" has little + aFileStore.usableSpace = 100_000L; + aFileStore.totalSpace = aFileStore.usableSpace * 2; + bFileStore.usableSpace = 1_000L; + bFileStore.totalSpace = bFileStore.usableSpace * 2; + LinkedHashSet availableDiskSpaceUpdates = new LinkedHashSet<>(); + try ( + var diskSpacePeriodicMonitor = ThreadPoolMergeExecutorService.startDiskSpaceMonitoring( + testThreadPool, + nodeEnvironment.dataPaths(), + ClusterSettings.createBuiltInClusterSettings(settings), + (availableDiskSpace) -> { + synchronized (availableDiskSpaceUpdates) { + availableDiskSpaceUpdates.add(availableDiskSpace); + } + } + ) + ) { + assertBusy(() -> { + synchronized (availableDiskSpaceUpdates) 
{ + assertThat(availableDiskSpaceUpdates.size(), is(1)); + // 100_000 (available) - 5% (default flood stage level) * 200_000 (total space) + assertThat(availableDiskSpaceUpdates.getLast().getBytes(), is(90_000L)); + } + }); + // "b" now has more available space + bFileStore.usableSpace = 110_000L; + bFileStore.totalSpace = 130_000L; + assertBusy(() -> { + synchronized (availableDiskSpaceUpdates) { + assertThat(availableDiskSpaceUpdates.size(), is(2)); + // 110_000 (available) - 5% (default flood stage level) * 130_000 (total space) + assertThat(availableDiskSpaceUpdates.getLast().getBytes(), is(103_500L)); + } + }); + // available space for "a" and "b" is below the limit => it's clamped down to "0" + aFileStore.usableSpace = 100L; + bFileStore.usableSpace = 1_000L; + assertBusy(() -> { + synchronized (availableDiskSpaceUpdates) { + assertThat(availableDiskSpaceUpdates.size(), is(3)); + // 1_000 (available) - 5% (default flood stage level) * 130_000 (total space) < 0 + assertThat(availableDiskSpaceUpdates.getLast().getBytes(), is(0L)); + } + }); + } + } + + public void testDiskSpaceMonitorStartsAsDisabled() throws Exception { + aFileStore.usableSpace = randomLongBetween(1L, 100L); + aFileStore.totalSpace = randomLongBetween(1L, 100L); + aFileStore.throwIoException = randomBoolean(); + bFileStore.usableSpace = randomLongBetween(1L, 100L); + bFileStore.totalSpace = randomLongBetween(1L, 100L); + bFileStore.throwIoException = randomBoolean(); + Settings.Builder settingsBuilder = Settings.builder().put(settings); + if (randomBoolean()) { + settingsBuilder.put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0"); + } else { + settingsBuilder.put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0s"); + } + Settings settings = settingsBuilder.build(); + ClusterSettings clusterSettings = ClusterSettings.createBuiltInClusterSettings(settings); + LinkedHashSet availableDiskSpaceUpdates = new
LinkedHashSet<>(); + try ( + var diskSpacePeriodicMonitor = ThreadPoolMergeExecutorService.startDiskSpaceMonitoring( + testThreadPool, + nodeEnvironment.dataPaths(), + clusterSettings, + (availableDiskSpace) -> { + synchronized (availableDiskSpaceUpdates) { + availableDiskSpaceUpdates.add(availableDiskSpace); + } + } + ) + ) { + assertThat(diskSpacePeriodicMonitor.isScheduled(), is(false)); + assertThat(availableDiskSpaceUpdates.size(), is(1)); + assertThat(availableDiskSpaceUpdates.getLast().getBytes(), is(Long.MAX_VALUE)); + // updating monitoring interval should enable the monitor + String intervalSettingValue = randomFrom("1s", "123ms", "5nanos", "2h"); + clusterSettings.applySettings( + Settings.builder() + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), intervalSettingValue) + .build() + ); + assertThat(diskSpacePeriodicMonitor.isScheduled(), is(true)); + assertThat(testThreadPool.scheduledTasks.size(), is(1)); + assertThat( + testThreadPool.scheduledTasks.getLast().v1(), + is( + TimeValue.parseTimeValue( + intervalSettingValue, + ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey() + ) + ) + ); + } + aFileStore.throwIoException = false; + bFileStore.throwIoException = false; + } + + public void testAvailableDiskSpaceMonitorWhenFileSystemStatErrors() throws Exception { + aFileStore.usableSpace = randomLongBetween(1L, 100L); + aFileStore.totalSpace = randomLongBetween(1L, 100L); + bFileStore.usableSpace = randomLongBetween(1L, 100L); + bFileStore.totalSpace = randomLongBetween(1L, 100L); + boolean aErrorsFirst = randomBoolean(); + if (aErrorsFirst) { + // the "a" file system will error when collecting stats + aFileStore.throwIoException = true; + bFileStore.throwIoException = false; + } else { + aFileStore.throwIoException = false; + bFileStore.throwIoException = true; + } + LinkedHashSet availableDiskSpaceUpdates = new LinkedHashSet<>(); + try ( + var diskSpacePeriodicMonitor = 
ThreadPoolMergeExecutorService.startDiskSpaceMonitoring( + testThreadPool, + nodeEnvironment.dataPaths(), + ClusterSettings.createBuiltInClusterSettings(settings), + (availableDiskSpace) -> { + synchronized (availableDiskSpaceUpdates) { + availableDiskSpaceUpdates.add(availableDiskSpace); + } + } + ) + ) { + assertBusy(() -> { + synchronized (availableDiskSpaceUpdates) { + assertThat(availableDiskSpaceUpdates.size(), is(1)); + if (aErrorsFirst) { + // uses the stats from "b" + assertThat( + availableDiskSpaceUpdates.getLast().getBytes(), + // the default 5% (same as flood stage level) + is(Math.max(bFileStore.usableSpace - bFileStore.totalSpace / 20, 0L)) + ); + } else { + // uses the stats from "a" + assertThat( + availableDiskSpaceUpdates.getLast().getBytes(), + // the default 5% (same as flood stage level) + is(Math.max(aFileStore.usableSpace - aFileStore.totalSpace / 20, 0L)) + ); + } + } + }); + if (aErrorsFirst) { + // the "b" file system will also now error when collecting stats + bFileStore.throwIoException = true; + } else { + // the "a" file system will also now error when collecting stats + aFileStore.throwIoException = true; + } + assertBusy(() -> { + synchronized (availableDiskSpaceUpdates) { + assertThat(availableDiskSpaceUpdates.size(), is(2)); + // consider the available disk space as unlimited when no fs stats can be collected + assertThat(availableDiskSpaceUpdates.getLast().getBytes(), is(Long.MAX_VALUE)); + } + }); + if (aErrorsFirst) { + // "a" fs stats collection recovered + aFileStore.throwIoException = false; + } else { + // "b" fs stats collection recovered + bFileStore.throwIoException = false; + } + assertBusy(() -> { + synchronized (availableDiskSpaceUpdates) { + assertThat(availableDiskSpaceUpdates.size(), is(3)); + if (aErrorsFirst) { + // uses the stats from "a" + assertThat( + availableDiskSpaceUpdates.getLast().getBytes(), + // the default 5% (same as flood stage level) + is(Math.max(aFileStore.usableSpace - aFileStore.totalSpace / 
20, 0L)) + ); + } else { + // uses the stats from "b" + assertThat( + availableDiskSpaceUpdates.getLast().getBytes(), + // the default 5% (same as flood stage level) + is(Math.max(bFileStore.usableSpace - bFileStore.totalSpace / 20, 0L)) + ); + } + } + }); + } + aFileStore.throwIoException = false; + bFileStore.throwIoException = false; + } + + public void testAvailableDiskSpaceMonitorSettingsUpdate() throws Exception { + ClusterSettings clusterSettings = ClusterSettings.createBuiltInClusterSettings(settings); + // path "b" has more usable (available) space, but path "a" has more total space + aFileStore.usableSpace = 900_000L; + aFileStore.totalSpace = 1_200_000L; + bFileStore.usableSpace = 1_000_000L; + bFileStore.totalSpace = 1_100_000L; + LinkedHashSet availableDiskSpaceUpdates = new LinkedHashSet<>(); + try ( + var diskSpacePeriodicMonitor = ThreadPoolMergeExecutorService.startDiskSpaceMonitoring( + testThreadPool, + nodeEnvironment.dataPaths(), + clusterSettings, + (availableDiskSpace) -> { + synchronized (availableDiskSpaceUpdates) { + availableDiskSpaceUpdates.add(availableDiskSpace); + } + } + ) + ) { + assertBusy(() -> { + synchronized (availableDiskSpaceUpdates) { + assertThat(availableDiskSpaceUpdates.size(), is(1)); + // 1_000_000 (available) - 5% (default flood stage level) * 1_100_000 (total space) + assertThat(availableDiskSpaceUpdates.getLast().getBytes(), is(945_000L)); + } + }, 5, TimeUnit.SECONDS); + // updated the ratio for the watermark + clusterSettings.applySettings( + Settings.builder().put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING.getKey(), "90%").build() + ); + assertBusy(() -> { + synchronized (availableDiskSpaceUpdates) { + assertThat(availableDiskSpaceUpdates.size(), is(2)); + // 1_000_000 (available) - 10% (indices.merge.disk.watermark.high) * 1_100_000 (total space) + assertThat(availableDiskSpaceUpdates.getLast().getBytes(), is(890_000L)); + } + }, 5, TimeUnit.SECONDS); + // absolute value for the
watermark limit + clusterSettings.applySettings( + Settings.builder().put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING.getKey(), "3000b").build() + ); + assertBusy(() -> { + synchronized (availableDiskSpaceUpdates) { + assertThat(availableDiskSpaceUpdates.size(), is(3)); + // 1_000_000 (available) - 3_000 (indices.merge.disk.watermark.high) + assertThat(availableDiskSpaceUpdates.getLast().getBytes(), is(997_000L)); + } + }, 5, TimeUnit.SECONDS); + // headroom value that takes priority over the watermark + clusterSettings.applySettings( + Settings.builder() + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING.getKey(), "50%") + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_HIGH_MAX_HEADROOM_SETTING.getKey(), "11111b") + .build() + ); + assertBusy(() -> { + synchronized (availableDiskSpaceUpdates) { + assertThat(availableDiskSpaceUpdates.size(), is(4)); + // 1_000_000 (available) - 11_111 (indices.merge.disk.watermark.high) + assertThat(availableDiskSpaceUpdates.getLast().getBytes(), is(988_889L)); + } + }, 5, TimeUnit.SECONDS); + // watermark limit that takes priority over the headroom + clusterSettings.applySettings( + Settings.builder() + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING.getKey(), "98%") + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_HIGH_MAX_HEADROOM_SETTING.getKey(), "22222b") + .build() + ); + assertBusy(() -> { + synchronized (availableDiskSpaceUpdates) { + assertThat(availableDiskSpaceUpdates.size(), is(5)); + // 1_000_000 (available) - 2% (indices.merge.disk.watermark.high) * 1_100_000 (total space) + assertThat(availableDiskSpaceUpdates.getLast().getBytes(), is(978_000L)); + } + }, 5, TimeUnit.SECONDS); + // headroom takes priority over the default watermark of 95% + clusterSettings.applySettings( + Settings.builder() + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_HIGH_MAX_HEADROOM_SETTING.getKey(), "22222b") + .build() + ); + 
assertBusy(() -> { + synchronized (availableDiskSpaceUpdates) { + assertThat(availableDiskSpaceUpdates.size(), is(6)); + // 1_000_000 (available) - 22_222 + assertThat(availableDiskSpaceUpdates.getLast().getBytes(), is(977_778L)); + } + }, 5, TimeUnit.SECONDS); + // watermark from routing allocation takes priority + clusterSettings.applySettings( + Settings.builder() + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "99%") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), "2b") + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_HIGH_MAX_HEADROOM_SETTING.getKey(), "22222b") + .build() + ); + assertBusy(() -> { + synchronized (availableDiskSpaceUpdates) { + assertThat(availableDiskSpaceUpdates.size(), is(7)); + // 1_000_000 (available) - 1% (cluster.routing.allocation.disk.watermark.flood_stage) * 1_100_000 (total space) + assertThat(availableDiskSpaceUpdates.getLast().getBytes(), is(989_000L)); + } + }, 5, TimeUnit.SECONDS); + } + } + + public void testAbortingOrRunningMergeTaskHoldsUpBudget() throws Exception { + aFileStore.totalSpace = randomLongBetween(1_000L, 10_000L); + bFileStore.totalSpace = randomLongBetween(1_000L, 10_000L); + aFileStore.usableSpace = randomLongBetween(900L, aFileStore.totalSpace); + bFileStore.usableSpace = randomLongBetween(900L, bFileStore.totalSpace); + boolean aHasMoreSpace = aFileStore.usableSpace > bFileStore.usableSpace; + try ( + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = ThreadPoolMergeExecutorService + .maybeCreateThreadPoolMergeExecutorService( + testThreadPool, + ClusterSettings.createBuiltInClusterSettings(settings), + nodeEnvironment + ) + ) { + assert threadPoolMergeExecutorService != null; + assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), greaterThanOrEqualTo(1)); + // assumes the 5% default value for the remaining space watermark + final long availableInitialBudget = 
aHasMoreSpace + ? aFileStore.usableSpace - aFileStore.totalSpace / 20 + : bFileStore.usableSpace - bFileStore.totalSpace / 20; + final AtomicLong expectedAvailableBudget = new AtomicLong(availableInitialBudget); + // wait for the merge scheduler to learn about the available disk space + assertBusy( + () -> assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(expectedAvailableBudget.get())) + ); + ThreadPoolMergeScheduler.MergeTask stallingMergeTask = mock(ThreadPoolMergeScheduler.MergeTask.class); + long taskBudget = randomLongBetween(1L, expectedAvailableBudget.get()); + when(stallingMergeTask.estimatedRemainingMergeSize()).thenReturn(taskBudget); + when(stallingMergeTask.schedule()).thenReturn(randomFrom(RUN, ABORT)); + CountDownLatch testDoneLatch = new CountDownLatch(1); + doAnswer(mock -> { + // wait to be signalled before completing (this holds up budget) + testDoneLatch.await(); + return null; + }).when(stallingMergeTask).run(); + doAnswer(mock -> { + // wait to be signalled before completing (this holds up budget) + testDoneLatch.await(); + return null; + }).when(stallingMergeTask).abort(); + threadPoolMergeExecutorService.submitMergeTask(stallingMergeTask); + // assert the merge task is holding up disk space budget + expectedAvailableBudget.set(expectedAvailableBudget.get() - taskBudget); + assertBusy( + () -> assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(expectedAvailableBudget.get())) + ); + // double check that submitting a runnable merge task under budget works correctly + ThreadPoolMergeScheduler.MergeTask mergeTask = mock(ThreadPoolMergeScheduler.MergeTask.class); + when(mergeTask.estimatedRemainingMergeSize()).thenReturn(randomLongBetween(0L, expectedAvailableBudget.get())); + when(mergeTask.schedule()).thenReturn(RUN); + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + assertBusy(() -> { + verify(mergeTask).schedule(); + verify(mergeTask).run(); + verify(mergeTask, 
times(0)).abort(); + }); + // let the test finish + testDoneLatch.countDown(); + assertBusy(() -> { + // available budget is back to the initial value + assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(availableInitialBudget)); + if (stallingMergeTask.schedule() == RUN) { + verify(stallingMergeTask).run(); + verify(stallingMergeTask, times(0)).abort(); + } else { + verify(stallingMergeTask).abort(); + verify(stallingMergeTask, times(0)).run(); + } + assertThat(threadPoolMergeExecutorService.allDone(), is(true)); + }); + } + } + + public void testBackloggedMergeTasksDoNotHoldUpBudget() throws Exception { + aFileStore.totalSpace = randomLongBetween(1_000L, 10_000L); + bFileStore.totalSpace = randomLongBetween(1_000L, 10_000L); + aFileStore.usableSpace = randomLongBetween(900L, aFileStore.totalSpace); + bFileStore.usableSpace = randomLongBetween(900L, bFileStore.totalSpace); + boolean aHasMoreSpace = aFileStore.usableSpace > bFileStore.usableSpace; + try ( + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = ThreadPoolMergeExecutorService + .maybeCreateThreadPoolMergeExecutorService( + testThreadPool, + ClusterSettings.createBuiltInClusterSettings(settings), + nodeEnvironment + ) + ) { + assert threadPoolMergeExecutorService != null; + assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), greaterThanOrEqualTo(1)); + // assumes the 5% default value for the remaining space watermark + final long availableInitialBudget = aHasMoreSpace + ? 
aFileStore.usableSpace - aFileStore.totalSpace / 20 + : bFileStore.usableSpace - bFileStore.totalSpace / 20; + final AtomicLong expectedAvailableBudget = new AtomicLong(availableInitialBudget); + assertBusy( + () -> assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(expectedAvailableBudget.get())) + ); + long backloggedMergeTaskDiskSpaceBudget = randomLongBetween(1L, expectedAvailableBudget.get()); + CountDownLatch testDoneLatch = new CountDownLatch(1); + // take care that there's still at least one thread available to run merges + int maxBlockingTasksToSubmit = mergeExecutorThreadCount - 1; + // first maybe submit some running or aborting merge tasks that hold up some budget while running or aborting + List runningMergeTasks = new ArrayList<>(); + List abortingMergeTasks = new ArrayList<>(); + while (expectedAvailableBudget.get() - backloggedMergeTaskDiskSpaceBudget > 0L + && maxBlockingTasksToSubmit-- > 0 + && randomBoolean()) { + ThreadPoolMergeScheduler.MergeTask mergeTask = mock(ThreadPoolMergeScheduler.MergeTask.class); + long taskBudget = randomLongBetween(1L, expectedAvailableBudget.get() - backloggedMergeTaskDiskSpaceBudget); + when(mergeTask.estimatedRemainingMergeSize()).thenReturn(taskBudget); + when(mergeTask.schedule()).thenReturn(randomFrom(RUN, ABORT)); + // this task runs/aborts, and it's going to hold up some budget for it + expectedAvailableBudget.set(expectedAvailableBudget.get() - taskBudget); + // this task will hold up budget because it blocks when it runs (to simulate it running for a long time) + doAnswer(mock -> { + // wait to be signalled before completing (this holds up budget) + testDoneLatch.await(); + return null; + }).when(mergeTask).run(); + doAnswer(mock -> { + // wait to be signalled before completing (this holds up budget) + testDoneLatch.await(); + return null; + }).when(mergeTask).abort(); + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + if (mergeTask.schedule() == RUN) { + 
runningMergeTasks.add(mergeTask); + } else { + abortingMergeTasks.add(mergeTask); + } + } + assertBusy( + () -> assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(expectedAvailableBudget.get())) + ); + // submit some backlogging merge tasks which should NOT hold up any budget + IdentityHashMap backloggingMergeTasksScheduleCountMap = new IdentityHashMap<>(); + int backloggingTaskCount = randomIntBetween(1, 10); + while (backloggingTaskCount-- > 0) { + ThreadPoolMergeScheduler.MergeTask mergeTask = mock(ThreadPoolMergeScheduler.MergeTask.class); + long taskBudget = randomLongBetween(1L, backloggedMergeTaskDiskSpaceBudget); + when(mergeTask.estimatedRemainingMergeSize()).thenReturn(taskBudget); + doAnswer(mock -> { + // task always backlogs (as long as the test hasn't finished) + if (testDoneLatch.getCount() > 0) { + return BACKLOG; + } else { + return RUN; + } + }).when(mergeTask).schedule(); + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + backloggingMergeTasksScheduleCountMap.put(mergeTask, 1); + } + int checkRounds = randomIntBetween(1, 10); + // assert all backlogging merge tasks have been scheduled while possibly re-enqueued, + // BUT none run and none aborted, AND the available budget is left unchanged + while (true) { + assertBusy(() -> { + for (ThreadPoolMergeScheduler.MergeTask mergeTask : backloggingMergeTasksScheduleCountMap.keySet()) { + verify(mergeTask, times(backloggingMergeTasksScheduleCountMap.get(mergeTask))).schedule(); + } + for (ThreadPoolMergeScheduler.MergeTask mergeTask : backloggingMergeTasksScheduleCountMap.keySet()) { + verify(mergeTask, times(0)).run(); + verify(mergeTask, times(0)).abort(); + } + // budget hasn't changed! 
+ assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(expectedAvailableBudget.get())); + }); + if (checkRounds-- <= 0) { + break; + } + // maybe re-enqueue backlogged merge task + for (ThreadPoolMergeScheduler.MergeTask backlogged : backloggingMergeTasksScheduleCountMap.keySet()) { + if (randomBoolean()) { + threadPoolMergeExecutorService.reEnqueueBackloggedMergeTask(backlogged); + backloggingMergeTasksScheduleCountMap.put(backlogged, backloggingMergeTasksScheduleCountMap.get(backlogged) + 1); + } + } + // double check that submitting a runnable merge task under budget works correctly + ThreadPoolMergeScheduler.MergeTask mergeTask = mock(ThreadPoolMergeScheduler.MergeTask.class); + long taskBudget = randomLongBetween(1L, backloggedMergeTaskDiskSpaceBudget); + when(mergeTask.estimatedRemainingMergeSize()).thenReturn(taskBudget); + when(mergeTask.schedule()).thenReturn(RUN); + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + assertBusy(() -> { + verify(mergeTask).schedule(); + verify(mergeTask).run(); + verify(mergeTask, times(0)).abort(); + }); + } + // let the test finish + testDoneLatch.countDown(); + for (ThreadPoolMergeScheduler.MergeTask backlogged : backloggingMergeTasksScheduleCountMap.keySet()) { + threadPoolMergeExecutorService.reEnqueueBackloggedMergeTask(backlogged); + } + assertBusy(() -> { + for (ThreadPoolMergeScheduler.MergeTask mergeTask : runningMergeTasks) { + verify(mergeTask).run(); + } + for (ThreadPoolMergeScheduler.MergeTask mergeTask : abortingMergeTasks) { + verify(mergeTask).abort(); + } + for (ThreadPoolMergeScheduler.MergeTask backlogged : backloggingMergeTasksScheduleCountMap.keySet()) { + verify(backlogged).run(); + } + // available budget is restored + assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(availableInitialBudget)); + assertThat(threadPoolMergeExecutorService.allDone(), is(true)); + }); + } + } + + public void 
testUnavailableBudgetBlocksNewMergeTasksFromStartingExecution() throws Exception { + aFileStore.totalSpace = 150_000L; + bFileStore.totalSpace = 140_000L; + boolean aHasMoreSpace = randomBoolean(); + if (aHasMoreSpace) { + // "a" has more available space + aFileStore.usableSpace = 120_000L; + bFileStore.usableSpace = 100_000L; + } else { + // "b" has more available space + aFileStore.usableSpace = 90_000L; + bFileStore.usableSpace = 110_000L; + } + try ( + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = ThreadPoolMergeExecutorService + .maybeCreateThreadPoolMergeExecutorService( + testThreadPool, + ClusterSettings.createBuiltInClusterSettings(settings), + nodeEnvironment + ) + ) { + assert threadPoolMergeExecutorService != null; + // wait for the budget to be updated from the available disk space + AtomicLong expectedAvailableBudget = new AtomicLong(); + assertBusy(() -> { + if (aHasMoreSpace) { + // 120_000L (available) - 5% (default flood stage level) * 150_000L (total) + assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(112_500L)); + expectedAvailableBudget.set(112_500L); + } else { + // 110_000L (available) - 5% (default flood stage level) * 140_000L (total) + assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(103_000L)); + expectedAvailableBudget.set(103_000L); + } + }); + List runningOrAbortingMergeTasksList = new ArrayList<>(); + List latchesBlockingMergeTasksList = new ArrayList<>(); + int submittedMergesCount = randomIntBetween(1, mergeExecutorThreadCount - 1); + // submit merge tasks that don't finish, in order to deplete the available budget + while (submittedMergesCount > 0 && expectedAvailableBudget.get() > 0L) { + ThreadPoolMergeScheduler.MergeTask mergeTask = mock(ThreadPoolMergeScheduler.MergeTask.class); + when(mergeTask.supportsIOThrottling()).thenReturn(randomBoolean()); + doAnswer(mock -> { + Schedule schedule = randomFrom(Schedule.values()); + if (schedule == 
BACKLOG) { + testThreadPool.executor(ThreadPool.Names.GENERIC).execute(() -> { + // re-enqueue backlogged merge task + threadPoolMergeExecutorService.reEnqueueBackloggedMergeTask(mergeTask); + }); + } + return schedule; + }).when(mergeTask).schedule(); + // let some task complete, which will NOT hold up any budget + if (randomBoolean()) { + // this task will NOT hold up any budget because it runs quickly (it is not blocked) + when(mergeTask.estimatedRemainingMergeSize()).thenReturn(randomLongBetween(1_000L, 10_000L)); + } else { + CountDownLatch blockMergeTaskLatch = new CountDownLatch(1); + long taskBudget = randomLongBetween(1L, expectedAvailableBudget.get()); + when(mergeTask.estimatedRemainingMergeSize()).thenReturn(taskBudget); + expectedAvailableBudget.set(expectedAvailableBudget.get() - taskBudget); + submittedMergesCount--; + // this task will hold up budget because it blocks when it runs (to simulate it running for a long time) + doAnswer(mock -> { + // wait to be signalled before completing (this holds up budget) + blockMergeTaskLatch.await(); + return null; + }).when(mergeTask).run(); + doAnswer(mock -> { + // wait to be signalled before completing (this holds up budget) + blockMergeTaskLatch.await(); + return null; + }).when(mergeTask).abort(); + runningOrAbortingMergeTasksList.add(mergeTask); + latchesBlockingMergeTasksList.add(blockMergeTaskLatch); + } + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + } + // currently running (or aborting) merge tasks have consumed some of the available budget + while (runningOrAbortingMergeTasksList.isEmpty() == false) { + assertBusy( + () -> assertThat( + threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), + is(expectedAvailableBudget.get()) + ) + ); + ThreadPoolMergeScheduler.MergeTask mergeTask1 = mock(ThreadPoolMergeScheduler.MergeTask.class); + when(mergeTask1.supportsIOThrottling()).thenReturn(randomBoolean()); + when(mergeTask1.schedule()).thenReturn(RUN); + 
ThreadPoolMergeScheduler.MergeTask mergeTask2 = mock(ThreadPoolMergeScheduler.MergeTask.class); + when(mergeTask2.supportsIOThrottling()).thenReturn(randomBoolean()); + when(mergeTask2.schedule()).thenReturn(RUN); + boolean task1Runs = randomBoolean(); + long currentAvailableBudget = expectedAvailableBudget.get(); + long overBudget = randomLongBetween(currentAvailableBudget + 1L, currentAvailableBudget + 100L); + long underBudget = randomLongBetween(0L, currentAvailableBudget); + if (task1Runs) { + // merge task 1 can run because it is under budget + when(mergeTask1.estimatedRemainingMergeSize()).thenReturn(underBudget); + // merge task 2 cannot run because it is over budget + when(mergeTask2.estimatedRemainingMergeSize()).thenReturn(overBudget); + } else { + // merge task 1 cannot run because it is over budget + when(mergeTask1.estimatedRemainingMergeSize()).thenReturn(overBudget); + // merge task 2 can run because it is under budget + when(mergeTask2.estimatedRemainingMergeSize()).thenReturn(underBudget); + } + threadPoolMergeExecutorService.submitMergeTask(mergeTask1); + threadPoolMergeExecutorService.submitMergeTask(mergeTask2); + assertBusy(() -> { + if (task1Runs) { + verify(mergeTask1).schedule(); + verify(mergeTask1).run(); + verify(mergeTask2, times(0)).schedule(); + verify(mergeTask2, times(0)).run(); + } else { + verify(mergeTask2).schedule(); + verify(mergeTask2).run(); + verify(mergeTask1, times(0)).schedule(); + verify(mergeTask1, times(0)).run(); + } + }); + // let one task finish from the bunch that is holding up budget + int index = randomIntBetween(0, runningOrAbortingMergeTasksList.size() - 1); + latchesBlockingMergeTasksList.remove(index).countDown(); + ThreadPoolMergeScheduler.MergeTask completedMergeTask = runningOrAbortingMergeTasksList.remove(index); + // update the expected budget given that one task now finished + expectedAvailableBudget.set(expectedAvailableBudget.get() + completedMergeTask.estimatedRemainingMergeSize()); + } + // let the 
test finish cleanly + assertBusy(() -> { + assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(aHasMoreSpace ? 112_500L : 103_000L)); + assertThat(threadPoolMergeExecutorService.allDone(), is(true)); + }); + } + } + + public void testMergeTasksAreUnblockedWhenMoreDiskSpaceBecomesAvailable() throws Exception { + aFileStore.totalSpace = randomLongBetween(300L, 1_000L); + bFileStore.totalSpace = randomLongBetween(300L, 1_000L); + long grantedUsableSpaceBuffer = randomLongBetween(10L, 50L); + aFileStore.usableSpace = randomLongBetween(200L, aFileStore.totalSpace - grantedUsableSpaceBuffer); + bFileStore.usableSpace = randomLongBetween(200L, bFileStore.totalSpace - grantedUsableSpaceBuffer); + boolean aHasMoreSpace = aFileStore.usableSpace > bFileStore.usableSpace; + Settings.Builder settingsBuilder = Settings.builder().put(settings); + // change the watermark level, just for coverage and it's easier with the calculations + if (randomBoolean()) { + settingsBuilder.put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING.getKey(), "90%"); + } else { + settingsBuilder.put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "90%"); + } + try ( + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = ThreadPoolMergeExecutorService + .maybeCreateThreadPoolMergeExecutorService( + testThreadPool, + ClusterSettings.createBuiltInClusterSettings(settingsBuilder.build()), + nodeEnvironment + ) + ) { + assert threadPoolMergeExecutorService != null; + assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), greaterThanOrEqualTo(1)); + // uses the 10% watermark limit + final long availableInitialBudget = aHasMoreSpace + ? 
aFileStore.usableSpace - aFileStore.totalSpace / 10 + : bFileStore.usableSpace - bFileStore.totalSpace / 10; + final AtomicLong expectedAvailableBudget = new AtomicLong(availableInitialBudget); + assertBusy( + () -> assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(expectedAvailableBudget.get())) + ); + // maybe let some merge tasks hold up some budget + // take care that there's still at least one thread available to run merges + int maxBlockingTasksToSubmit = mergeExecutorThreadCount - 1; + // first maybe submit some running or aborting merge tasks that hold up some budget while running or aborting + List runningMergeTasks = new ArrayList<>(); + List abortingMergeTasks = new ArrayList<>(); + CountDownLatch testDoneLatch = new CountDownLatch(1); + while (expectedAvailableBudget.get() > 0L && maxBlockingTasksToSubmit-- > 0 && randomBoolean()) { + ThreadPoolMergeScheduler.MergeTask mergeTask = mock(ThreadPoolMergeScheduler.MergeTask.class); + long taskBudget = randomLongBetween(1L, expectedAvailableBudget.get()); + when(mergeTask.estimatedRemainingMergeSize()).thenReturn(taskBudget); + when(mergeTask.schedule()).thenReturn(randomFrom(RUN, ABORT)); + // this task runs/aborts, and it's going to hold up some budget for it + expectedAvailableBudget.set(expectedAvailableBudget.get() - taskBudget); + // this task will hold up budget because it blocks when it runs (to simulate it running for a long time) + doAnswer(mock -> { + // wait to be signalled before completing (this holds up budget) + testDoneLatch.await(); + return null; + }).when(mergeTask).run(); + doAnswer(mock -> { + // wait to be signalled before completing (this holds up budget) + testDoneLatch.await(); + return null; + }).when(mergeTask).abort(); + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + if (mergeTask.schedule() == RUN) { + runningMergeTasks.add(mergeTask); + } else { + abortingMergeTasks.add(mergeTask); + } + } + assertBusy(() -> { + 
assertThat(threadPoolMergeExecutorService.getMergeTasksQueueLength(), is(0)); + assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(expectedAvailableBudget.get())); + }); + // send some runnable merge tasks that although runnable are currently over budget + int overBudgetTaskCount = randomIntBetween(1, 5); + List overBudgetTasksToRunList = new ArrayList<>(); + List overBudgetTasksToAbortList = new ArrayList<>(); + while (overBudgetTaskCount-- > 0) { + ThreadPoolMergeScheduler.MergeTask mergeTask = mock(ThreadPoolMergeScheduler.MergeTask.class); + // currently over-budget + long taskBudget = randomLongBetween( + expectedAvailableBudget.get() + 1L, + expectedAvailableBudget.get() + grantedUsableSpaceBuffer + ); + when(mergeTask.estimatedRemainingMergeSize()).thenReturn(taskBudget); + Schedule schedule = randomFrom(RUN, ABORT); + when(mergeTask.schedule()).thenReturn(schedule); + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + if (schedule == RUN) { + overBudgetTasksToRunList.add(mergeTask); + } else { + overBudgetTasksToAbortList.add(mergeTask); + } + } + // over-budget tasks did not run, are enqueued, and budget is unchanged + assertBusy(() -> { + for (ThreadPoolMergeScheduler.MergeTask mergeTask : overBudgetTasksToAbortList) { + verify(mergeTask, times(0)).schedule(); + verify(mergeTask, times(0)).run(); + verify(mergeTask, times(0)).abort(); + } + for (ThreadPoolMergeScheduler.MergeTask mergeTask : overBudgetTasksToRunList) { + verify(mergeTask, times(0)).schedule(); + verify(mergeTask, times(0)).run(); + verify(mergeTask, times(0)).abort(); + } + assertThat( + threadPoolMergeExecutorService.getMergeTasksQueueLength(), + is(overBudgetTasksToAbortList.size() + overBudgetTasksToRunList.size()) + ); + assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(expectedAvailableBudget.get())); + }); + // more disk space becomes available + if (aHasMoreSpace) { + aFileStore.usableSpace += 
grantedUsableSpaceBuffer; + } else { + bFileStore.usableSpace += grantedUsableSpaceBuffer; + } + expectedAvailableBudget.set(expectedAvailableBudget.get() + grantedUsableSpaceBuffer); + // all over-budget tasks can now run because more disk space became available + assertBusy(() -> { + for (ThreadPoolMergeScheduler.MergeTask mergeTask : overBudgetTasksToRunList) { + verify(mergeTask).schedule(); + verify(mergeTask).run(); + verify(mergeTask, times(0)).abort(); + } + for (ThreadPoolMergeScheduler.MergeTask mergeTask : overBudgetTasksToAbortList) { + verify(mergeTask).schedule(); + verify(mergeTask, times(0)).run(); + verify(mergeTask).abort(); + } + assertThat(threadPoolMergeExecutorService.getMergeTasksQueueLength(), is(0)); + assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(expectedAvailableBudget.get())); + }); + // let test finish cleanly + testDoneLatch.countDown(); + assertBusy(() -> { + for (ThreadPoolMergeScheduler.MergeTask mergeTask : runningMergeTasks) { + verify(mergeTask).run(); + } + for (ThreadPoolMergeScheduler.MergeTask mergeTask : abortingMergeTasks) { + verify(mergeTask).abort(); + } + assertThat( + threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), + is(availableInitialBudget + grantedUsableSpaceBuffer) + ); + assertThat(threadPoolMergeExecutorService.allDone(), is(true)); + assertThat( + threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), + is(availableInitialBudget + grantedUsableSpaceBuffer) + ); + }); + } + } +} diff --git a/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorServiceTests.java b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorServiceTests.java index 8ce1645148337..aaffd697189b0 100644 --- a/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorServiceTests.java +++ b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorServiceTests.java @@ -9,20 +9,27 @@ 
package org.elasticsearch.index.engine; +import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.util.concurrent.ConcurrentCollections; import org.elasticsearch.common.util.concurrent.DeterministicTaskQueue; import org.elasticsearch.common.util.concurrent.EsExecutors; +import org.elasticsearch.env.NodeEnvironment; +import org.elasticsearch.index.engine.ThreadPoolMergeExecutorService.MergeTaskPriorityBlockingQueue; +import org.elasticsearch.index.engine.ThreadPoolMergeExecutorService.PriorityBlockingQueueWithBudget; import org.elasticsearch.index.engine.ThreadPoolMergeScheduler.MergeTask; import org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.threadpool.TestThreadPool; import org.elasticsearch.threadpool.ThreadPool; +import org.junit.After; import org.mockito.ArgumentCaptor; +import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Comparator; +import java.util.IdentityHashMap; import java.util.List; import java.util.PriorityQueue; import java.util.Set; @@ -42,6 +49,7 @@ import static org.hamcrest.Matchers.either; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.greaterThan; +import static org.hamcrest.Matchers.greaterThanOrEqualTo; import static org.hamcrest.Matchers.is; import static org.hamcrest.Matchers.lessThan; import static org.hamcrest.Matchers.lessThanOrEqualTo; @@ -55,9 +63,24 @@ public class ThreadPoolMergeExecutorServiceTests extends ESTestCase { - public void testNewMergeTaskIsAbortedWhenThreadPoolIsShutdown() { - TestThreadPool testThreadPool = new TestThreadPool("test"); - ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService(testThreadPool); + private NodeEnvironment nodeEnvironment; + + @After + public void closeNodeEnv() { + if (nodeEnvironment != null) { + 
nodeEnvironment.close(); + nodeEnvironment = null; + } + } + + public void testNewMergeTaskIsAbortedWhenThreadPoolIsShutdown() throws IOException { + TestThreadPool testThreadPool = new TestThreadPool("test", Settings.EMPTY); + nodeEnvironment = newNodeEnvironment(Settings.EMPTY); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService( + testThreadPool, + Settings.EMPTY, + nodeEnvironment + ); // shutdown the thread pool testThreadPool.shutdown(); MergeTask mergeTask = mock(MergeTask.class); @@ -77,9 +100,18 @@ public void testEnqueuedAndBackloggedMergesAreStillExecutedWhenThreadPoolIsShutd Settings settings = Settings.builder() .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), true) .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) + // disable fs available disk space feature for this test + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0s") .build(); TestThreadPool testThreadPool = new TestThreadPool("test", settings); - ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService(testThreadPool); + nodeEnvironment = newNodeEnvironment(settings); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService( + testThreadPool, + settings, + nodeEnvironment + ); + var countingListener = new CountingMergeEventListener(); + threadPoolMergeExecutorService.registerMergeEventListener(countingListener); assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); Semaphore runMergeSemaphore = new Semaphore(0); ThreadPoolExecutor threadPoolExecutor = (ThreadPoolExecutor) testThreadPool.executor(ThreadPool.Names.MERGE); @@ -174,9 +206,16 @@ public void testTargetIORateChangesWhenSubmittingMergeTasks() throws Exception { Settings settings = Settings.builder() 
.put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), true) .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) + // disable fs available disk space feature for this test + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0s") .build(); + nodeEnvironment = newNodeEnvironment(settings); try (TestThreadPool testThreadPool = new TestThreadPool("test", settings)) { - ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService(testThreadPool); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService( + testThreadPool, + settings, + nodeEnvironment + ); assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); Semaphore runMergeSemaphore = new Semaphore(0); AtomicInteger submittedIOThrottledMergeTasks = new AtomicInteger(); @@ -254,9 +293,16 @@ public void testIORateIsAdjustedForRunningMergeTasks() throws Exception { Settings settings = Settings.builder() .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), true) .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) + // disable fs available disk space feature for this test + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0s") .build(); + nodeEnvironment = newNodeEnvironment(settings); try (TestThreadPool testThreadPool = new TestThreadPool("test", settings)) { - ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService(testThreadPool); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService( + testThreadPool, + settings, + nodeEnvironment + ); assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); ThreadPoolExecutor threadPoolExecutor = (ThreadPoolExecutor) 
testThreadPool.executor(ThreadPool.Names.MERGE); Semaphore runMergeSemaphore = new Semaphore(0); @@ -318,7 +364,7 @@ public void testIORateIsAdjustedForRunningMergeTasks() throws Exception { } } - public void testIORateAdjustedForSubmittedTasksWhenExecutionRateIsSpeedy() { + public void testIORateAdjustedForSubmittedTasksWhenExecutionRateIsSpeedy() throws IOException { // the executor runs merge tasks at a faster rate than the rate that merge tasks are submitted int submittedVsExecutedRateOutOf1000 = randomIntBetween(0, 250); testIORateAdjustedForSubmittedTasks(randomIntBetween(50, 1000), submittedVsExecutedRateOutOf1000, randomIntBetween(0, 5)); @@ -326,7 +372,7 @@ public void testIORateAdjustedForSubmittedTasksWhenExecutionRateIsSpeedy() { testIORateAdjustedForSubmittedTasks(randomIntBetween(50, 1000), submittedVsExecutedRateOutOf1000, randomIntBetween(5, 50)); } - public void testIORateAdjustedForSubmittedTasksWhenExecutionRateIsSluggish() { + public void testIORateAdjustedForSubmittedTasksWhenExecutionRateIsSluggish() throws IOException { // the executor runs merge tasks at a faster rate than the rate that merge tasks are submitted int submittedVsExecutedRateOutOf1000 = randomIntBetween(750, 1000); testIORateAdjustedForSubmittedTasks(randomIntBetween(50, 1000), submittedVsExecutedRateOutOf1000, randomIntBetween(0, 5)); @@ -334,7 +380,7 @@ public void testIORateAdjustedForSubmittedTasksWhenExecutionRateIsSluggish() { testIORateAdjustedForSubmittedTasks(randomIntBetween(50, 1000), submittedVsExecutedRateOutOf1000, randomIntBetween(5, 50)); } - public void testIORateAdjustedForSubmittedTasksWhenExecutionRateIsOnPar() { + public void testIORateAdjustedForSubmittedTasksWhenExecutionRateIsOnPar() throws IOException { // the executor runs merge tasks at a faster rate than the rate that merge tasks are submitted int submittedVsExecutedRateOutOf1000 = randomIntBetween(250, 750); testIORateAdjustedForSubmittedTasks(randomIntBetween(50, 1000), 
submittedVsExecutedRateOutOf1000, randomIntBetween(0, 5)); @@ -342,14 +388,24 @@ public void testIORateAdjustedForSubmittedTasksWhenExecutionRateIsOnPar() { testIORateAdjustedForSubmittedTasks(randomIntBetween(50, 1000), submittedVsExecutedRateOutOf1000, randomIntBetween(5, 50)); } - private void testIORateAdjustedForSubmittedTasks( - int totalTasksToSubmit, - int submittedVsExecutedRateOutOf1000, - int initialTasksToSubmit - ) { + private void testIORateAdjustedForSubmittedTasks(int totalTasksToSubmit, int submittedVsExecutedRateOutOf1000, int initialTasksToSubmit) + throws IOException { DeterministicTaskQueue mergeExecutorTaskQueue = new DeterministicTaskQueue(); ThreadPool mergeExecutorThreadPool = mergeExecutorTaskQueue.getThreadPool(); - ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService(mergeExecutorThreadPool); + Settings settings = Settings.builder() + // disable fs available disk space feature for this test + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0s") + .build(); + if (nodeEnvironment != null) { + nodeEnvironment.close(); + nodeEnvironment = null; + } + nodeEnvironment = newNodeEnvironment(settings); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService( + mergeExecutorThreadPool, + settings, + nodeEnvironment + ); final AtomicInteger currentlySubmittedMergeTaskCount = new AtomicInteger(); final AtomicLong targetIORateLimit = new AtomicLong(ThreadPoolMergeExecutorService.START_IO_RATE.getBytes()); final AtomicReference lastRunTask = new AtomicReference<>(); @@ -407,9 +463,16 @@ public void testMergeTasksRunConcurrently() throws Exception { Settings settings = Settings.builder() .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), true) .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) + // disable fs available disk space feature for this test + 
.put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0s") .build(); + nodeEnvironment = newNodeEnvironment(settings); try (TestThreadPool testThreadPool = new TestThreadPool("test", settings)) { - ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService(testThreadPool); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService( + testThreadPool, + settings, + nodeEnvironment + ); assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); // more merge tasks than max concurrent merges allowed to run concurrently int totalMergeTasksCount = mergeExecutorThreadCount + randomIntBetween(1, 5); @@ -450,7 +513,7 @@ public void testMergeTasksRunConcurrently() throws Exception { assertThat(threadPoolMergeExecutorService.getRunningMergeTasks().size(), is(mergeExecutorThreadCount)); // with the other merge tasks enqueued assertThat( - threadPoolMergeExecutorService.getQueuedMergeTasks().size(), + threadPoolMergeExecutorService.getMergeTasksQueueLength(), is(totalMergeTasksCount - mergeExecutorThreadCount - finalCompletedTasksCount) ); // also check thread-pool stats for the same @@ -470,7 +533,7 @@ public void testMergeTasksRunConcurrently() throws Exception { // there are fewer available merges than available threads assertThat(threadPoolMergeExecutorService.getRunningMergeTasks().size(), is(finalRemainingMergeTasksCount)); // no more merges enqueued - assertThat(threadPoolMergeExecutorService.getQueuedMergeTasks().size(), is(0)); + assertThat(threadPoolMergeExecutorService.getMergeTasksQueueLength(), is(0)); // also check thread-pool stats for the same assertThat(threadPoolExecutor.getActiveCount(), is(finalRemainingMergeTasksCount)); assertThat(threadPoolExecutor.getQueue().size(), is(0)); @@ -487,9 +550,16 @@ public void testThreadPoolStatsWithBackloggedMergeTasks() throws Exception { Settings settings = 
Settings.builder() .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), true) .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) + // disable fs available disk space feature for this test + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0s") .build(); + nodeEnvironment = newNodeEnvironment(settings); try (TestThreadPool testThreadPool = new TestThreadPool("test", settings)) { - ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService(testThreadPool); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService( + testThreadPool, + settings, + nodeEnvironment + ); assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); int totalMergeTasksCount = randomIntBetween(1, 10); ThreadPoolExecutor threadPoolExecutor = (ThreadPoolExecutor) testThreadPool.executor(ThreadPool.Names.MERGE); @@ -518,7 +588,7 @@ public void testThreadPoolStatsWithBackloggedMergeTasks() throws Exception { assertThat(threadPoolExecutor.getActiveCount(), is(backloggedMergeTasksList.size())); assertThat(threadPoolExecutor.getQueue().size(), is(0)); } - assertThat(threadPoolMergeExecutorService.getQueuedMergeTasks().size(), is(0)); + assertThat(threadPoolMergeExecutorService.getMergeTasksQueueLength(), is(0)); }); // re-enqueue backlogged merge tasks for (MergeTask backloggedMergeTask : backloggedMergeTasksList) { @@ -540,9 +610,16 @@ public void testBackloggedMergeTasksExecuteExactlyOnce() throws Exception { .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), true) // few merge threads, in order to increase contention .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) + // disable fs available disk space feature for this test + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0s") .build(); 
+ nodeEnvironment = newNodeEnvironment(settings); try (TestThreadPool testThreadPool = new TestThreadPool("test", settings)) { - ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService(testThreadPool); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService( + testThreadPool, + settings, + nodeEnvironment + ); assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); // many merge tasks concurrently int mergeTaskCount = randomIntBetween(10, 100); @@ -598,22 +675,31 @@ public void testBackloggedMergeTasksExecuteExactlyOnce() throws Exception { } } - public void testMergeTasksExecuteInSizeOrder() { + public void testMergeTasksExecuteInSizeOrder() throws IOException { DeterministicTaskQueue mergeExecutorTaskQueue = new DeterministicTaskQueue(); ThreadPool mergeExecutorThreadPool = mergeExecutorTaskQueue.getThreadPool(); - ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService(mergeExecutorThreadPool); + Settings settings = Settings.builder() + // disable fs available disk space feature for this test + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0s") + .build(); + nodeEnvironment = newNodeEnvironment(settings); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService( + mergeExecutorThreadPool, + settings, + nodeEnvironment + ); DeterministicTaskQueue reEnqueueBackloggedTaskQueue = new DeterministicTaskQueue(); int mergeTaskCount = randomIntBetween(10, 100); // sort merge tasks available to run by size PriorityQueue mergeTasksAvailableToRun = new PriorityQueue<>( mergeTaskCount, - Comparator.comparingLong(MergeTask::estimatedMergeSize) + Comparator.comparingLong(MergeTask::estimatedRemainingMergeSize) ); for (int i = 0; i < mergeTaskCount; i++) { MergeTask mergeTask = mock(MergeTask.class); 
when(mergeTask.supportsIOThrottling()).thenReturn(randomBoolean()); // merge tasks of various sizes (0 might be a valid value) - when(mergeTask.estimatedMergeSize()).thenReturn(randomLongBetween(0, 10)); + when(mergeTask.estimatedRemainingMergeSize()).thenReturn(randomLongBetween(0, 10)); doAnswer(mock -> { // each individual merge task can either "run" or be "backlogged" at any point in time Schedule schedule = randomFrom(Schedule.values()); @@ -635,7 +721,10 @@ public void testMergeTasksExecuteInSizeOrder() { } if (schedule == RUN && mergeTasksAvailableToRun.isEmpty() == false) { // assert the merge task that's now going to run is the smallest of the ones currently available to run - assertThat(mergeTask.estimatedMergeSize(), lessThanOrEqualTo(mergeTasksAvailableToRun.peek().estimatedMergeSize())); + assertThat( + mergeTask.estimatedRemainingMergeSize(), + lessThanOrEqualTo(mergeTasksAvailableToRun.peek().estimatedRemainingMergeSize()) + ); } return schedule; }).when(mergeTask).schedule(); @@ -660,14 +749,151 @@ public void testMergeTasksExecuteInSizeOrder() { } } - static ThreadPoolMergeExecutorService getThreadPoolMergeExecutorService(ThreadPool threadPool) { + public void testMergeTaskQueueAvailableBudgetTracking() throws Exception { + MergeTaskPriorityBlockingQueue mergeTaskPriorityBlockingQueue = new MergeTaskPriorityBlockingQueue(); + assertThat(mergeTaskPriorityBlockingQueue.getAvailableBudget(), is(0L)); + long availableBudget = randomLongBetween(1, 10); + mergeTaskPriorityBlockingQueue.updateBudget(availableBudget); + assertThat(mergeTaskPriorityBlockingQueue.getAvailableBudget(), is(availableBudget)); + + int taskCount = randomIntBetween(5, 15); + for (int i = 0; i < taskCount; i++) { + MergeTask mergeTask = mock(MergeTask.class); + when(mergeTask.estimatedRemainingMergeSize()).thenReturn(randomLongBetween(1, 10)); + mergeTaskPriorityBlockingQueue.enqueue(mergeTask); + } + assertThat(mergeTaskPriorityBlockingQueue.queueSize(), is(taskCount)); + 
assertThat(mergeTaskPriorityBlockingQueue.getAvailableBudget(), is(availableBudget)); + + List.ElementWithReleasableBudget> tookElements = new ArrayList<>(); + + while (mergeTaskPriorityBlockingQueue.isQueueEmpty() == false) { + if (mergeTaskPriorityBlockingQueue.peekQueue().estimatedRemainingMergeSize() <= mergeTaskPriorityBlockingQueue + .getAvailableBudget() && randomBoolean()) { + // take another element (merge task) from the queue + long prevBudget = mergeTaskPriorityBlockingQueue.getAvailableBudget(); + tookElements.add(mergeTaskPriorityBlockingQueue.take()); + long afterBudget = mergeTaskPriorityBlockingQueue.getAvailableBudget(); + assertThat(afterBudget, greaterThanOrEqualTo(0L)); + assertThat(prevBudget - afterBudget, is(tookElements.getLast().element().estimatedRemainingMergeSize())); + } else if (tookElements.stream().anyMatch(e -> e.isClosed() == false) && randomBoolean()) { + // "closes" a previously took element to simulate it has gone out of scope + int index = randomValueOtherThanMany( + i -> tookElements.get(i).isClosed(), + () -> randomIntBetween(0, tookElements.size() - 1) + ); + var elementToClose = tookElements.remove(index); + long prevBudget = mergeTaskPriorityBlockingQueue.getAvailableBudget(); + elementToClose.close(); + long afterBudget = mergeTaskPriorityBlockingQueue.getAvailableBudget(); + // budget hasn't yet changed, the update budget method needs to be invoked before it does + assertThat(afterBudget, is(prevBudget)); + } else if (randomBoolean()) { + // update (possibly increment) the available budget + long budgetIncrement = randomLongBetween(0, 3); + availableBudget += budgetIncrement; + mergeTaskPriorityBlockingQueue.updateBudget(availableBudget); + // "closed" took elements should not impact budget computation + tookElements.removeIf(PriorityBlockingQueueWithBudget.ElementWithReleasableBudget::isClosed); + long expectedBudget = availableBudget - tookElements.stream() + .mapToLong(e -> e.element().estimatedRemainingMergeSize()) + 
.sum(); + long afterBudget = mergeTaskPriorityBlockingQueue.getAvailableBudget(); + assertThat(afterBudget, is(expectedBudget)); + } + } + } + + public void testMergeTaskQueueBudgetTrackingWhenEstimatedRemainingMergeSizeChanges() throws Exception { + MergeTaskPriorityBlockingQueue mergeTaskPriorityBlockingQueue = new MergeTaskPriorityBlockingQueue(); + assertThat(mergeTaskPriorityBlockingQueue.getAvailableBudget(), is(0L)); + // plenty of available budget (this should be fixed for this test) + final long availableBudget = randomLongBetween(1000L, 2000L); + mergeTaskPriorityBlockingQueue.updateBudget(availableBudget); + assertThat(mergeTaskPriorityBlockingQueue.getAvailableBudget(), is(availableBudget)); + + IdentityHashMap<MergeTask, Long> budgetMap = new IdentityHashMap<>(); + int taskCount = randomIntBetween(5, 15); + for (int i = 0; i < taskCount; i++) { + MergeTask mergeTask = mock(MergeTask.class); + budgetMap.put(mergeTask, randomLongBetween(1L, 10L)); + doAnswer(invocation -> budgetMap.get((MergeTask) invocation.getMock())).when(mergeTask).estimatedRemainingMergeSize(); + mergeTaskPriorityBlockingQueue.enqueue(mergeTask); + } + assertThat(mergeTaskPriorityBlockingQueue.queueSize(), is(taskCount)); + assertThat(mergeTaskPriorityBlockingQueue.getAvailableBudget(), is(availableBudget)); + + List<PriorityBlockingQueueWithBudget.ElementWithReleasableBudget> tookElements = new ArrayList<>(); + + while (mergeTaskPriorityBlockingQueue.isQueueEmpty() == false) { + if (tookElements.stream().allMatch(PriorityBlockingQueueWithBudget.ElementWithReleasableBudget::isClosed) || randomBoolean()) { + // take another element (merge task) from the queue + long prevBudget = mergeTaskPriorityBlockingQueue.getAvailableBudget(); + tookElements.add(mergeTaskPriorityBlockingQueue.take()); + long afterBudget = mergeTaskPriorityBlockingQueue.getAvailableBudget(); + assertThat(afterBudget, greaterThanOrEqualTo(0L)); + assertThat(prevBudget - afterBudget, is(tookElements.getLast().element().estimatedRemainingMergeSize())); + } else if 
(randomBoolean()) { + // "closes" a previously took element to simulate it has gone out of scope + int index = randomValueOtherThanMany( + i -> tookElements.get(i).isClosed(), + () -> randomIntBetween(0, tookElements.size() - 1) + ); + var elementToClose = tookElements.remove(index); + long prevBudget = mergeTaskPriorityBlockingQueue.getAvailableBudget(); + elementToClose.close(); + long afterBudget = mergeTaskPriorityBlockingQueue.getAvailableBudget(); + // budget hasn't yet changed, the update budget method needs to be invoked before it does + assertThat(afterBudget, is(prevBudget)); + } else { + // update the remaining merge size of a took (but not "closed") merge task + int index = randomValueOtherThanMany( + i -> tookElements.get(i).isClosed(), + () -> randomIntBetween(0, tookElements.size() - 1) + ); + var elementToUpdate = tookElements.get(index); + long prevElementBudget = elementToUpdate.element().estimatedRemainingMergeSize(); + long afterElementBudget = randomValueOtherThan(prevElementBudget, () -> randomLongBetween(1L, 10L)); + budgetMap.put(elementToUpdate.element(), afterElementBudget); + assertThat(elementToUpdate.element().estimatedRemainingMergeSize(), is(afterElementBudget)); + // "closed" took elements should not impact budget computation + tookElements.removeIf(PriorityBlockingQueueWithBudget.ElementWithReleasableBudget::isClosed); + long expectedBudget = availableBudget - tookElements.stream().mapToLong(e -> budgetMap.get(e.element())).sum(); + mergeTaskPriorityBlockingQueue.updateBudget(availableBudget); + long afterBudget = mergeTaskPriorityBlockingQueue.getAvailableBudget(); + assertThat(afterBudget, is(expectedBudget)); + } + } + } + + private static class CountingMergeEventListener implements MergeEventListener { + AtomicInteger queued = new AtomicInteger(); + AtomicInteger aborted = new AtomicInteger(); + AtomicInteger completed = new AtomicInteger(); + + @Override + public void onMergeQueued(OnGoingMerge merge, long 
estimateMergeMemoryBytes) { + queued.incrementAndGet(); + } + + @Override + public void onMergeCompleted(OnGoingMerge merge) { + completed.incrementAndGet(); + } + + @Override + public void onMergeAborted(OnGoingMerge merge) { + aborted.incrementAndGet(); + } + } + + static ThreadPoolMergeExecutorService getThreadPoolMergeExecutorService( + ThreadPool threadPool, + Settings settings, + NodeEnvironment nodeEnvironment + ) { ThreadPoolMergeExecutorService threadPoolMergeExecutorService = ThreadPoolMergeExecutorService - .maybeCreateThreadPoolMergeExecutorService( - threadPool, - randomBoolean() - ? Settings.EMPTY - : Settings.builder().put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), true).build() - ); + .maybeCreateThreadPoolMergeExecutorService(threadPool, ClusterSettings.createBuiltInClusterSettings(settings), nodeEnvironment); assertNotNull(threadPoolMergeExecutorService); assertTrue(threadPoolMergeExecutorService.allDone()); return threadPoolMergeExecutorService; diff --git a/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java index d407e865efbaf..d80b53f441884 100644 --- a/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java +++ b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java @@ -17,6 +17,7 @@ import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.util.concurrent.DeterministicTaskQueue; import org.elasticsearch.common.util.concurrent.EsExecutors; +import org.elasticsearch.env.NodeEnvironment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.MergeSchedulerConfig; import org.elasticsearch.index.engine.ThreadPoolMergeScheduler.MergeTask; @@ -26,6 +27,7 @@ import org.elasticsearch.test.IndexSettingsModule; import org.elasticsearch.threadpool.TestThreadPool; import 
org.elasticsearch.threadpool.ThreadPool; +import org.junit.After; import org.mockito.ArgumentCaptor; import java.io.IOException; @@ -53,10 +55,25 @@ public class ThreadPoolMergeSchedulerTests extends ESTestCase { + private NodeEnvironment nodeEnvironment; + + @After + public void closeNodeEnv() { + if (nodeEnvironment != null) { + nodeEnvironment.close(); + nodeEnvironment = null; + } + } + public void testMergesExecuteInSizeOrder() throws IOException { DeterministicTaskQueue threadPoolTaskQueue = new DeterministicTaskQueue(); + Settings settings = Settings.builder() + // disable fs available disk space feature for this test + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0s") + .build(); + nodeEnvironment = newNodeEnvironment(settings); ThreadPoolMergeExecutorService threadPoolMergeExecutorService = ThreadPoolMergeExecutorServiceTests - .getThreadPoolMergeExecutorService(threadPoolTaskQueue.getThreadPool()); + .getThreadPoolMergeExecutorService(threadPoolTaskQueue.getThreadPool(), settings, nodeEnvironment); try ( ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( new ShardId("index", "_na_", 1), @@ -139,7 +156,10 @@ public void testSimpleMergeTaskReEnqueueingBySize() { threadPoolMergeExecutorService ); // sort backlogged merges by size - PriorityQueue<MergeTask> backloggedMergeTasks = new PriorityQueue<>(16, Comparator.comparingLong(MergeTask::estimatedMergeSize)); + PriorityQueue<MergeTask> backloggedMergeTasks = new PriorityQueue<>( + 16, + Comparator.comparingLong(MergeTask::estimatedRemainingMergeSize) + ); // more merge tasks than merge threads int mergeCount = mergeExecutorThreadCount + randomIntBetween(2, 10); for (int i = 0; i < mergeCount; i++) { @@ -338,10 +358,13 @@ public void testMergeSourceWithFollowUpMergesRunSequentially() throws Exception Settings settings = Settings.builder() .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) 
.put(MergeSchedulerConfig.MAX_THREAD_COUNT_SETTING.getKey(), mergeExecutorThreadCount) + // disable fs available disk space feature for this test + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0s") .build(); + nodeEnvironment = newNodeEnvironment(settings); try (TestThreadPool testThreadPool = new TestThreadPool("test", settings)) { ThreadPoolMergeExecutorService threadPoolMergeExecutorService = ThreadPoolMergeExecutorServiceTests - .getThreadPoolMergeExecutorService(testThreadPool); + .getThreadPoolMergeExecutorService(testThreadPool, settings, nodeEnvironment); assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); try ( ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( @@ -410,10 +433,13 @@ public void testMergesRunConcurrently() throws Exception { Settings settings = Settings.builder() .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) .put(MergeSchedulerConfig.MAX_THREAD_COUNT_SETTING.getKey(), mergeSchedulerMaxThreadCount) + // disable fs available disk space feature for this test + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0s") .build(); + nodeEnvironment = newNodeEnvironment(settings); try (TestThreadPool testThreadPool = new TestThreadPool("test", settings)) { ThreadPoolMergeExecutorService threadPoolMergeExecutorService = ThreadPoolMergeExecutorServiceTests - .getThreadPoolMergeExecutorService(testThreadPool); + .getThreadPoolMergeExecutorService(testThreadPool, settings, nodeEnvironment); assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); ThreadPoolExecutor threadPoolExecutor = (ThreadPoolExecutor) testThreadPool.executor(ThreadPool.Names.MERGE); try ( @@ -455,7 +481,7 @@ public void testMergesRunConcurrently() throws Exception { // also check the same for the thread-pool executor 
assertThat(threadPoolMergeExecutorService.getRunningMergeTasks().size(), is(mergeSchedulerMaxThreadCount)); // queued merge tasks do not include backlogged merges - assertThat(threadPoolMergeExecutorService.getQueuedMergeTasks().size(), is(0)); + assertThat(threadPoolMergeExecutorService.getMergeTasksQueueLength(), is(0)); // also check thread-pool stats for the same // there are active thread-pool threads waiting for the backlogged merge tasks to be re-enqueued int activeMergeThreads = Math.min(mergeCount - finalCompletedMergesCount, mergeExecutorThreadCount); @@ -476,7 +502,7 @@ public void testMergesRunConcurrently() throws Exception { // also check thread-pool executor for the same assertThat(threadPoolMergeExecutorService.getRunningMergeTasks().size(), is(finalRemainingMergesCount)); // no more backlogged merges - assertThat(threadPoolMergeExecutorService.getQueuedMergeTasks().size(), is(0)); + assertThat(threadPoolMergeExecutorService.getMergeTasksQueueLength(), is(0)); // also check thread-pool stats for the same assertThat(threadPoolExecutor.getActiveCount(), is(finalRemainingMergesCount)); assertThat(threadPoolExecutor.getQueue().size(), is(0)); @@ -495,10 +521,13 @@ public void testSchedulerCloseWaitsForRunningMerge() throws Exception { Settings settings = Settings.builder() .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) .put(MergeSchedulerConfig.MAX_THREAD_COUNT_SETTING.getKey(), mergeSchedulerMaxThreadCount) + // disable fs available disk space feature for this test + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0s") .build(); + nodeEnvironment = newNodeEnvironment(settings); try (TestThreadPool testThreadPool = new TestThreadPool("test", settings)) { ThreadPoolMergeExecutorService threadPoolMergeExecutorService = ThreadPoolMergeExecutorServiceTests - .getThreadPoolMergeExecutorService(testThreadPool); + .getThreadPoolMergeExecutorService(testThreadPool, settings, 
nodeEnvironment); assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); try ( ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( diff --git a/server/src/test/java/org/elasticsearch/index/shard/RefreshListenersTests.java b/server/src/test/java/org/elasticsearch/index/shard/RefreshListenersTests.java index 4e280f5443787..843c7f3f58852 100644 --- a/server/src/test/java/org/elasticsearch/index/shard/RefreshListenersTests.java +++ b/server/src/test/java/org/elasticsearch/index/shard/RefreshListenersTests.java @@ -24,6 +24,7 @@ import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.lucene.uid.Versions; import org.elasticsearch.common.metrics.MeanMetric; +import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.util.BigArrays; import org.elasticsearch.common.util.concurrent.EsExecutors; @@ -33,6 +34,7 @@ import org.elasticsearch.core.Releasable; import org.elasticsearch.core.Strings; import org.elasticsearch.core.TimeValue; +import org.elasticsearch.env.NodeEnvironment; import org.elasticsearch.index.Index; import org.elasticsearch.index.IndexModule; import org.elasticsearch.index.IndexSettings; @@ -91,6 +93,7 @@ public class RefreshListenersTests extends ESTestCase { private Engine engine; private volatile int maxListeners; private ThreadPool threadPool; + private NodeEnvironment nodeEnvironment; private ThreadPoolMergeExecutorService threadPoolMergeExecutorService; private Store store; @@ -104,7 +107,12 @@ public void setupListeners() throws Exception { .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), randomBoolean()) .build(); IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("index", settings); - threadPoolMergeExecutorService = ThreadPoolMergeExecutorService.maybeCreateThreadPoolMergeExecutorService(threadPool, settings); + 
nodeEnvironment = newNodeEnvironment(settings); + threadPoolMergeExecutorService = ThreadPoolMergeExecutorService.maybeCreateThreadPoolMergeExecutorService( + threadPool, + ClusterSettings.createBuiltInClusterSettings(settings), + nodeEnvironment + ); listeners = new RefreshListeners( () -> maxListeners, () -> engine.refresh("too-many-listeners"), @@ -177,8 +185,7 @@ public void onFailedEngine(String reason, @Nullable Exception e) { @After public void tearDownListeners() throws Exception { - IOUtils.close(engine, store); - terminate(threadPool); + IOUtils.close(engine, store, nodeEnvironment, () -> terminate(threadPool)); } public void testBeforeRefresh() throws Exception { diff --git a/test/framework/src/main/java/org/elasticsearch/index/engine/EngineTestCase.java b/test/framework/src/main/java/org/elasticsearch/index/engine/EngineTestCase.java index d4554df1617ee..574d3ac47daa9 100644 --- a/test/framework/src/main/java/org/elasticsearch/index/engine/EngineTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/index/engine/EngineTestCase.java @@ -61,6 +61,7 @@ import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.lucene.uid.Versions; +import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.unit.ByteSizeValue; import org.elasticsearch.common.util.BigArrays; @@ -68,6 +69,7 @@ import org.elasticsearch.core.IOUtils; import org.elasticsearch.core.Nullable; import org.elasticsearch.core.TimeValue; +import org.elasticsearch.env.NodeEnvironment; import org.elasticsearch.index.Index; import org.elasticsearch.index.IndexModule; import org.elasticsearch.index.IndexSettings; @@ -155,6 +157,7 @@ public abstract class EngineTestCase extends ESTestCase { protected static final IndexSettings INDEX_SETTINGS = IndexSettingsModule.newIndexSettings("index", Settings.EMPTY); protected ThreadPool threadPool; + 
protected NodeEnvironment nodeEnvironment; protected ThreadPoolMergeExecutorService threadPoolMergeExecutorService; protected TranslogHandler translogHandler; @@ -243,9 +246,11 @@ public void setUp() throws Exception { } defaultSettings = IndexSettingsModule.newIndexSettings("index", indexSettings()); threadPool = new TestThreadPool(getClass().getName()); + nodeEnvironment = newNodeEnvironment(defaultSettings.getNodeSettings()); threadPoolMergeExecutorService = ThreadPoolMergeExecutorService.maybeCreateThreadPoolMergeExecutorService( threadPool, - defaultSettings.getNodeSettings() + ClusterSettings.createBuiltInClusterSettings(defaultSettings.getNodeSettings()), + nodeEnvironment ); store = createStore(); @@ -394,7 +399,7 @@ public void tearDown() throws Exception { assertAtMostOneLuceneDocumentPerSequenceNumber(replicaEngine); } } finally { - IOUtils.close(replicaEngine, storeReplica, engine, store, () -> terminate(threadPool)); + IOUtils.close(replicaEngine, storeReplica, engine, store, () -> terminate(threadPool), nodeEnvironment); } } diff --git a/test/framework/src/main/java/org/elasticsearch/index/shard/IndexShardTestCase.java b/test/framework/src/main/java/org/elasticsearch/index/shard/IndexShardTestCase.java index e8286835e9cfa..65c3e01cd6c87 100644 --- a/test/framework/src/main/java/org/elasticsearch/index/shard/IndexShardTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/index/shard/IndexShardTestCase.java @@ -154,6 +154,7 @@ public void onRecoveryFailure(RecoveryFailedException e, boolean sendShardFailur }; protected ThreadPool threadPool; + protected NodeEnvironment nodeEnvironment; protected ThreadPoolMergeExecutorService threadPoolMergeExecutorService; protected Executor writeExecutor; protected long primaryTerm; @@ -171,7 +172,12 @@ public void setUp() throws Exception { super.setUp(); Settings settings = threadPoolSettings(); threadPool = setUpThreadPool(settings); - threadPoolMergeExecutorService = 
ThreadPoolMergeExecutorService.maybeCreateThreadPoolMergeExecutorService(threadPool, settings); + nodeEnvironment = newNodeEnvironment(settings); + threadPoolMergeExecutorService = ThreadPoolMergeExecutorService.maybeCreateThreadPoolMergeExecutorService( + threadPool, + ClusterSettings.createBuiltInClusterSettings(settings), + nodeEnvironment + ); writeExecutor = threadPool.executor(ThreadPool.Names.WRITE); primaryTerm = randomIntBetween(1, 100); // use random but fixed term for creating shards failOnShardFailures(); @@ -184,7 +190,7 @@ protected ThreadPool setUpThreadPool(Settings settings) { @Override public void tearDown() throws Exception { try { - tearDownThreadPool(); + IOUtils.close(nodeEnvironment, this::tearDownThreadPool); } finally { super.tearDown(); } diff --git a/x-pack/plugin/ccr/src/test/java/org/elasticsearch/xpack/ccr/index/engine/FollowingEngineTests.java b/x-pack/plugin/ccr/src/test/java/org/elasticsearch/xpack/ccr/index/engine/FollowingEngineTests.java index 957570918cde3..a2cb7b85e880f 100644 --- a/x-pack/plugin/ccr/src/test/java/org/elasticsearch/xpack/ccr/index/engine/FollowingEngineTests.java +++ b/x-pack/plugin/ccr/src/test/java/org/elasticsearch/xpack/ccr/index/engine/FollowingEngineTests.java @@ -18,10 +18,13 @@ import org.elasticsearch.common.Strings; import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.compress.CompressedXContent; +import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.unit.ByteSizeValue; import org.elasticsearch.common.util.BigArrays; +import org.elasticsearch.core.IOUtils; import org.elasticsearch.core.TimeValue; +import org.elasticsearch.env.NodeEnvironment; import org.elasticsearch.index.Index; import org.elasticsearch.index.IndexMode; import org.elasticsearch.index.IndexModule; @@ -84,6 +87,7 @@ public class FollowingEngineTests extends ESTestCase { private ThreadPool threadPool; + private 
NodeEnvironment nodeEnvironment; private ThreadPoolMergeExecutorService threadPoolMergeExecutorService; private Index index; private ShardId shardId; @@ -98,7 +102,12 @@ public void setUp() throws Exception { .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), randomBoolean()) .build(); threadPool = new TestThreadPool("following-engine-tests", settings); - threadPoolMergeExecutorService = ThreadPoolMergeExecutorService.maybeCreateThreadPoolMergeExecutorService(threadPool, settings); + nodeEnvironment = newNodeEnvironment(settings); + threadPoolMergeExecutorService = ThreadPoolMergeExecutorService.maybeCreateThreadPoolMergeExecutorService( + threadPool, + ClusterSettings.createBuiltInClusterSettings(settings), + nodeEnvironment + ); index = new Index("index", "uuid"); shardId = new ShardId(index, 0); primaryTerm.set(randomLongBetween(1, Long.MAX_VALUE)); @@ -107,7 +116,7 @@ public void setUp() throws Exception { @Override public void tearDown() throws Exception { - terminate(threadPool); + IOUtils.close(nodeEnvironment, () -> terminate(threadPool)); super.tearDown(); } From 771c2943cecd0f5e43301e122a5f2668cff67e77 Mon Sep 17 00:00:00 2001 From: Pooya Salehi Date: Thu, 24 Apr 2025 11:49:06 +0200 Subject: [PATCH 10/14] Expose merge events and their memory usage estimate (#126667) Relates ES-10961 --- .../index/engine/InternalEngine.java | 2 +- .../index/engine/MergeEventListener.java | 26 +++++++++++++++ .../engine/MergeMemoryEstimateProvider.java | 21 ++++++++++++ .../ThreadPoolMergeExecutorService.java | 11 ++++++- .../engine/ThreadPoolMergeScheduler.java | 29 ++++++++++++++--- .../ThreadPoolMergeExecutorServiceTests.java | 13 ++++++++ .../engine/ThreadPoolMergeSchedulerTests.java | 32 ++++++++++++------- 7 files changed, 117 insertions(+), 17 deletions(-) create mode 100644 server/src/main/java/org/elasticsearch/index/engine/MergeEventListener.java create mode 100644 
server/src/main/java/org/elasticsearch/index/engine/MergeMemoryEstimateProvider.java diff --git a/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java b/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java index 27448a0b2b4a2..aefd43466d617 100644 --- a/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java +++ b/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java @@ -2870,7 +2870,7 @@ private final class EngineThreadPoolMergeScheduler extends ThreadPoolMergeSchedu IndexSettings indexSettings, ThreadPoolMergeExecutorService threadPoolMergeExecutorService ) { - super(shardId, indexSettings, threadPoolMergeExecutorService); + super(shardId, indexSettings, threadPoolMergeExecutorService, InternalEngine.this::estimateMergeBytes); } @Override diff --git a/server/src/main/java/org/elasticsearch/index/engine/MergeEventListener.java b/server/src/main/java/org/elasticsearch/index/engine/MergeEventListener.java new file mode 100644 index 0000000000000..f029820535dc1 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/engine/MergeEventListener.java @@ -0,0 +1,26 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.index.engine; + +import org.elasticsearch.index.merge.OnGoingMerge; + +public interface MergeEventListener { + + /** + * + * @param merge + * @param estimateMergeMemoryBytes estimate of the memory needed to perform a merge + */ + void onMergeQueued(OnGoingMerge merge, long estimateMergeMemoryBytes); + + void onMergeCompleted(OnGoingMerge merge); + + void onMergeAborted(OnGoingMerge merge); +} diff --git a/server/src/main/java/org/elasticsearch/index/engine/MergeMemoryEstimateProvider.java b/server/src/main/java/org/elasticsearch/index/engine/MergeMemoryEstimateProvider.java new file mode 100644 index 0000000000000..10d0aa38adf3a --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/engine/MergeMemoryEstimateProvider.java @@ -0,0 +1,21 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.index.engine; + +import org.apache.lucene.index.MergePolicy; + +@FunctionalInterface +public interface MergeMemoryEstimateProvider { + + /** + * Returns an estimate of the memory needed to perform a merge + */ + long estimateMergeMemoryBytes(MergePolicy.OneMerge merge); +} diff --git a/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorService.java b/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorService.java index 32abd2dd8ada2..9e74c19d8a85e 100644 --- a/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorService.java +++ b/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorService.java @@ -36,6 +36,7 @@ import java.util.Map; import java.util.PriorityQueue; import java.util.Set; +import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.ExecutorService; import java.util.concurrent.RejectedExecutionException; import java.util.concurrent.atomic.AtomicInteger; @@ -199,6 +200,8 @@ public Iterator<Setting<?>> settings() { private final int concurrentMergesCeilLimitForThrottling; private final AvailableDiskSpacePeriodicMonitor availableDiskSpacePeriodicMonitor; + private final List<MergeEventListener> mergeEventListeners = new CopyOnWriteArrayList<>(); + public static @Nullable ThreadPoolMergeExecutorService maybeCreateThreadPoolMergeExecutorService( ThreadPool threadPool, ClusterSettings clusterSettings, @@ -266,7 +269,7 @@ boolean submitMergeTask(MergeTask mergeTask) { ); } // then enqueue the merge task proper - queuedMergeTasks.add(mergeTask); + enqueueMergeTask(mergeTask); return true; } } @@ -358,6 +361,7 @@ private void runMergeTask(MergeTask mergeTask) { if (mergeTask.supportsIOThrottling()) { ioThrottledMergeTasksCount.decrementAndGet(); } + mergeEventListeners.forEach(l -> l.onMergeCompleted(mergeTask.getOnGoingMerge())); } } @@ -370,6 +374,7 @@ private void abortMergeTask(MergeTask mergeTask) { if (mergeTask.supportsIOThrottling()) { 
ioThrottledMergeTasksCount.decrementAndGet(); } + mergeEventListeners.forEach(l -> l.onMergeAborted(mergeTask.getOnGoingMerge())); } } @@ -760,6 +765,10 @@ public boolean usingMaxTargetIORateBytesPerSec() { return MAX_IO_RATE.getBytes() == targetIORateBytesPerSec.get(); } + public void registerMergeEventListener(MergeEventListener consumer) { + mergeEventListeners.add(consumer); + } + // exposed for tests Set getRunningMergeTasks() { return runningMergeTasks; diff --git a/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeScheduler.java b/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeScheduler.java index 09efe3e4bf64e..7c74eaf0c625d 100644 --- a/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeScheduler.java +++ b/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeScheduler.java @@ -65,11 +65,13 @@ public class ThreadPoolMergeScheduler extends MergeScheduler implements Elastics private final AtomicLong doneMergeTaskCount = new AtomicLong(); private final CountDownLatch closedWithNoRunningMerges = new CountDownLatch(1); private volatile boolean closed = false; + private final MergeMemoryEstimateProvider mergeMemoryEstimateProvider; public ThreadPoolMergeScheduler( ShardId shardId, IndexSettings indexSettings, - ThreadPoolMergeExecutorService threadPoolMergeExecutorService + ThreadPoolMergeExecutorService threadPoolMergeExecutorService, + MergeMemoryEstimateProvider mergeMemoryEstimateProvider ) { this.shardId = shardId; this.config = indexSettings.getMergeSchedulerConfig(); @@ -81,6 +83,7 @@ public ThreadPoolMergeScheduler( : Double.POSITIVE_INFINITY ); this.threadPoolMergeExecutorService = threadPoolMergeExecutorService; + this.mergeMemoryEstimateProvider = mergeMemoryEstimateProvider; } @Override @@ -176,11 +179,13 @@ MergeTask newMergeTask(MergeSource mergeSource, MergePolicy.OneMerge merge, Merg // forced merges, as well as merges triggered when closing a shard, always run un-IO-throttled 
boolean isAutoThrottle = mergeTrigger != MergeTrigger.CLOSING && merge.getStoreMergeInfo().mergeMaxNumSegments() == -1; // IO throttling cannot be toggled for existing merge tasks, only new merge tasks pick up the updated IO throttling setting + long estimateMergeMemoryBytes = mergeMemoryEstimateProvider.estimateMergeMemoryBytes(merge); return new MergeTask( mergeSource, merge, isAutoThrottle && config.isAutoThrottle(), - "Lucene Merge Task #" + submittedMergeTaskCount.incrementAndGet() + " for shard " + shardId + "Lucene Merge Task #" + submittedMergeTaskCount.incrementAndGet() + " for shard " + shardId, + estimateMergeMemoryBytes ); } @@ -313,14 +318,22 @@ class MergeTask implements Runnable { private final OnGoingMerge onGoingMerge; private final MergeRateLimiter rateLimiter; private final boolean supportsIOThrottling; - - MergeTask(MergeSource mergeSource, MergePolicy.OneMerge merge, boolean supportsIOThrottling, String name) { + private final long mergeMemoryEstimateBytes; + + MergeTask( + MergeSource mergeSource, + MergePolicy.OneMerge merge, + boolean supportsIOThrottling, + String name, + long mergeMemoryEstimateBytes + ) { this.name = name; this.mergeStartTimeNS = new AtomicLong(); this.mergeSource = mergeSource; this.onGoingMerge = new OnGoingMerge(merge); this.rateLimiter = new MergeRateLimiter(merge.getMergeProgress()); this.supportsIOThrottling = supportsIOThrottling; + this.mergeMemoryEstimateBytes = mergeMemoryEstimateBytes; } Schedule schedule() { @@ -463,6 +476,14 @@ long estimatedRemainingMergeSize() { return Math.max(0L, estimatedMergeSize - rateLimiter.getTotalBytesWritten()); } + public long getMergeMemoryEstimateBytes() { + return mergeMemoryEstimateBytes; + } + + public OnGoingMerge getOnGoingMerge() { + return onGoingMerge; + } + @Override public String toString() { return name + (onGoingMerge.getMerge().isAborted() ? 
" (aborted)" : ""); diff --git a/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorServiceTests.java b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorServiceTests.java index aaffd697189b0..9b74d68326108 100644 --- a/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorServiceTests.java +++ b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorServiceTests.java @@ -19,6 +19,7 @@ import org.elasticsearch.index.engine.ThreadPoolMergeExecutorService.PriorityBlockingQueueWithBudget; import org.elasticsearch.index.engine.ThreadPoolMergeScheduler.MergeTask; import org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule; +import org.elasticsearch.index.merge.OnGoingMerge; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.threadpool.TestThreadPool; import org.elasticsearch.threadpool.ThreadPool; @@ -116,6 +117,8 @@ public void testEnqueuedAndBackloggedMergesAreStillExecutedWhenThreadPoolIsShutd Semaphore runMergeSemaphore = new Semaphore(0); ThreadPoolExecutor threadPoolExecutor = (ThreadPoolExecutor) testThreadPool.executor(ThreadPool.Names.MERGE); AtomicInteger doneMergesCount = new AtomicInteger(0); + AtomicInteger reEnqueuedBackloggedMergesCount = new AtomicInteger(); + AtomicInteger abortedMergesCount = new AtomicInteger(); // submit more merge tasks than there are threads so that some are enqueued for (int i = 0; i < mergesToSubmit; i++) { MergeTask mergeTask = mock(MergeTask.class); @@ -127,6 +130,7 @@ public void testEnqueuedAndBackloggedMergesAreStillExecutedWhenThreadPoolIsShutd if (schedule == BACKLOG) { // reenqueue backlogged merge task new Thread(() -> threadPoolMergeExecutorService.reEnqueueBackloggedMergeTask(mergeTask)).start(); + reEnqueuedBackloggedMergesCount.incrementAndGet(); } return schedule; }).when(mergeTask).schedule(); @@ -146,6 +150,7 @@ public void testEnqueuedAndBackloggedMergesAreStillExecutedWhenThreadPoolIsShutd } 
runMergeSemaphore.acquireUninterruptibly(); doneMergesCount.incrementAndGet(); + abortedMergesCount.incrementAndGet(); return null; }).when(mergeTask).abort(); threadPoolMergeExecutorService.submitMergeTask(mergeTask); @@ -157,6 +162,12 @@ public void testEnqueuedAndBackloggedMergesAreStillExecutedWhenThreadPoolIsShutd // with the other merge tasks enqueued assertThat(threadPoolExecutor.getQueue().size(), is(mergesToSubmit - mergeExecutorThreadCount)); }); + assertBusy( + () -> assertThat( + countingListener.queued.get(), + equalTo(threadPoolExecutor.getActiveCount() + threadPoolExecutor.getQueue().size() + reEnqueuedBackloggedMergesCount.get()) + ) + ); // shutdown prevents new merge tasks to be enqueued but existing ones should be allowed to continue testThreadPool.shutdown(); // assert all executors, except the merge one, are terminated @@ -197,6 +208,8 @@ public void testEnqueuedAndBackloggedMergesAreStillExecutedWhenThreadPoolIsShutd assertTrue(threadPoolExecutor.isTerminated()); assertTrue(threadPoolMergeExecutorService.allDone()); }); + assertThat(countingListener.aborted.get() + countingListener.completed.get(), equalTo(doneMergesCount.get())); + assertThat(countingListener.aborted.get(), equalTo(abortedMergesCount.get())); } public void testTargetIORateChangesWhenSubmittingMergeTasks() throws Exception { diff --git a/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java index d80b53f441884..cdadf85870622 100644 --- a/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java +++ b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java @@ -78,7 +78,8 @@ public void testMergesExecuteInSizeOrder() throws IOException { ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( new ShardId("index", "_na_", 1), IndexSettingsModule.newIndexSettings("index", 
Settings.EMPTY), - threadPoolMergeExecutorService + threadPoolMergeExecutorService, + merge -> 0 ) ) { List executedMergesList = new ArrayList<>(); @@ -120,7 +121,8 @@ public void testSimpleMergeTaskBacklogging() { ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( new ShardId("index", "_na_", 1), IndexSettingsModule.newIndexSettings("index", mergeSchedulerSettings), - threadPoolMergeExecutorService + threadPoolMergeExecutorService, + merge -> 0 ); // more merge tasks than merge threads int mergeCount = mergeExecutorThreadCount + randomIntBetween(1, 5); @@ -153,7 +155,8 @@ public void testSimpleMergeTaskReEnqueueingBySize() { ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( new ShardId("index", "_na_", 1), IndexSettingsModule.newIndexSettings("index", mergeSchedulerSettings), - threadPoolMergeExecutorService + threadPoolMergeExecutorService, + merge -> 0 ); // sort backlogged merges by size PriorityQueue backloggedMergeTasks = new PriorityQueue<>( @@ -370,7 +373,8 @@ public void testMergeSourceWithFollowUpMergesRunSequentially() throws Exception ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( new ShardId("index", "_na_", 1), IndexSettingsModule.newIndexSettings("index", settings), - threadPoolMergeExecutorService + threadPoolMergeExecutorService, + merge -> 0 ) ) { MergeSource mergeSource = mock(MergeSource.class); @@ -446,7 +450,8 @@ public void testMergesRunConcurrently() throws Exception { ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( new ShardId("index", "_na_", 1), IndexSettingsModule.newIndexSettings("index", settings), - threadPoolMergeExecutorService + threadPoolMergeExecutorService, + merge -> 0 ) ) { // at least 1 extra merge than there are concurrently allowed @@ -533,7 +538,8 @@ public void testSchedulerCloseWaitsForRunningMerge() throws Exception { ThreadPoolMergeScheduler threadPoolMergeScheduler = new 
ThreadPoolMergeScheduler( new ShardId("index", "_na_", 1), IndexSettingsModule.newIndexSettings("index", settings), - threadPoolMergeExecutorService + threadPoolMergeExecutorService, + merge -> 0 ) ) { CountDownLatch mergeDoneLatch = new CountDownLatch(1); @@ -605,7 +611,8 @@ public void testAutoIOThrottleForMergeTasksWhenSchedulerDisablesIt() throws Exce ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( new ShardId("index", "_na_", 1), indexSettings, - threadPoolMergeExecutorService + threadPoolMergeExecutorService, + merge -> 0 ) ) { threadPoolMergeScheduler.merge(mergeSource, randomFrom(MergeTrigger.values())); @@ -634,7 +641,8 @@ public void testAutoIOThrottleForMergeTasks() throws Exception { ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( new ShardId("index", "_na_", 1), indexSettings, - threadPoolMergeExecutorService + threadPoolMergeExecutorService, + merge -> 0 ) ) { threadPoolMergeScheduler.merge(mergeSource, randomFrom(MergeTrigger.values())); @@ -650,7 +658,8 @@ public void testAutoIOThrottleForMergeTasks() throws Exception { ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( new ShardId("index", "_na_", 1), indexSettings, - threadPoolMergeExecutorService + threadPoolMergeExecutorService, + merge -> 0 ) ) { // merge submitted upon closing @@ -666,7 +675,8 @@ public void testAutoIOThrottleForMergeTasks() throws Exception { ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( new ShardId("index", "_na_", 1), indexSettings, - threadPoolMergeExecutorService + threadPoolMergeExecutorService, + merge -> 0 ) ) { // merge submitted upon closing @@ -697,7 +707,7 @@ static class TestThreadPoolMergeScheduler extends ThreadPoolMergeScheduler { IndexSettings indexSettings, ThreadPoolMergeExecutorService threadPoolMergeExecutorService ) { - super(shardId, indexSettings, threadPoolMergeExecutorService); + super(shardId, indexSettings, 
threadPoolMergeExecutorService, merge -> 0); } @Override From 9eef088517b179817db6eab36d42ce3fb38a961f Mon Sep 17 00:00:00 2001 From: Brian Rothermich <35897794+BrianRothermich@users.noreply.github.com> Date: Thu, 22 May 2025 16:52:53 -0400 Subject: [PATCH 11/14] Bring over merge scheduler features from stateless (#128155) Relates to an effort to consolidate the stateless merge scheduler with the current (stateful) merge scheduler from main ES. This PR brings over features required to maintain parity with the stateless scheduler. Specifically, a few methods are added for the stateless scheduler to override: Adds an overridable method shouldSkipMerge to test for skipping merges Adds 2 additional lifecycle callbacks to the scheduler for when a merge is enqueued and when a merge is executed or aborted. This is used by stateless to track active + queued merges per-shard Adds overridable methods for enabling/disabling IO/thread/merge count throttling Other functionality required by the stateless merge scheduler can use the existing callbacks from the stateful scheduler: beforeMerge can be overridden to prewarm afterMerge can be overridden to refresh after big merges Relates ES-10264 --------- Co-authored-by: elasticsearchmachine --- .../engine/ThreadPoolMergeScheduler.java | 67 +++++++++++++++++-- .../engine/ThreadPoolMergeSchedulerTests.java | 25 +++++++ 2 files changed, 85 insertions(+), 7 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeScheduler.java b/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeScheduler.java index 7c74eaf0c625d..78a9695bea540 100644 --- a/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeScheduler.java +++ b/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeScheduler.java @@ -67,6 +67,14 @@ public class ThreadPoolMergeScheduler extends MergeScheduler implements Elastics private volatile boolean closed = false; private final 
MergeMemoryEstimateProvider mergeMemoryEstimateProvider; + /** + * Creates a thread-pool-based merge scheduler that runs merges in a thread pool. + * + * @param shardId the shard id associated with this merge scheduler + * @param indexSettings used to obtain the {@link MergeSchedulerConfig} + * @param threadPoolMergeExecutorService the executor service used to execute merge tasks from this scheduler + * @param mergeMemoryEstimateProvider provides an estimate for how much memory a merge will take + */ public ThreadPoolMergeScheduler( ShardId shardId, IndexSettings indexSettings, @@ -146,6 +154,16 @@ protected void beforeMerge(OnGoingMerge merge) {} */ protected void afterMerge(OnGoingMerge merge) {} + /** + * A callback allowing for custom logic when a merge is queued. + */ + protected void mergeQueued(OnGoingMerge merge) {} + + /** + * A callback allowing for custom logic after a merge is executed or aborted. + */ + protected void mergeExecutedOrAborted(OnGoingMerge merge) {} + /** * A callback that's invoked when indexing should throttle down indexing in order to let merging to catch up. */ @@ -157,6 +175,34 @@ protected void enableIndexingThrottling(int numRunningMerges, int numQueuedMerge */ protected void disableIndexingThrottling(int numRunningMerges, int numQueuedMerges, int configuredMaxMergeCount) {} + /** + * Returns true if scheduled merges should be skipped (aborted) + */ + protected boolean shouldSkipMerge() { + return false; + } + + /** + * Returns true if IO-throttling is enabled + */ + protected boolean isAutoThrottle() { + return config.isAutoThrottle(); + } + + /** + * Returns the maximum number of active merges before being throttled + */ + protected int getMaxMergeCount() { + return config.getMaxMergeCount(); + } + + /** + * Returns the maximum number of threads running merges before being throttled + */ + protected int getMaxThreadCount() { + return config.getMaxThreadCount(); + } + /** * A callback for exceptions thrown while merging. 
*/ @@ -168,6 +214,7 @@ protected void handleMergeException(Throwable t) { boolean submitNewMergeTask(MergeSource mergeSource, MergePolicy.OneMerge merge, MergeTrigger mergeTrigger) { try { MergeTask mergeTask = newMergeTask(mergeSource, merge, mergeTrigger); + mergeQueued(mergeTask.onGoingMerge); return threadPoolMergeExecutorService.submitMergeTask(mergeTask); } finally { checkMergeTaskThrottling(); @@ -183,7 +230,7 @@ MergeTask newMergeTask(MergeSource mergeSource, MergePolicy.OneMerge merge, Merg return new MergeTask( mergeSource, merge, - isAutoThrottle && config.isAutoThrottle(), + isAutoThrottle && isAutoThrottle(), "Lucene Merge Task #" + submittedMergeTaskCount.incrementAndGet() + " for shard " + shardId, estimateMergeMemoryBytes ); @@ -193,7 +240,7 @@ private void checkMergeTaskThrottling() { long submittedMergesCount = submittedMergeTaskCount.get(); long doneMergesCount = doneMergeTaskCount.get(); int runningMergesCount = runningMergeTasks.size(); - int configuredMaxMergeCount = config.getMaxMergeCount(); + int configuredMaxMergeCount = getMaxMergeCount(); // both currently running and enqueued merge tasks are considered "active" for throttling purposes int activeMerges = (int) (submittedMergesCount - doneMergesCount); if (activeMerges > configuredMaxMergeCount @@ -223,7 +270,12 @@ synchronized Schedule schedule(MergeTask mergeTask) { if (closed) { // do not run or backlog tasks when closing the merge scheduler, instead abort them return Schedule.ABORT; - } else if (runningMergeTasks.size() < config.getMaxThreadCount()) { + } else if (shouldSkipMerge()) { + if (verbose()) { + message(String.format(Locale.ROOT, "skipping merge task %s", mergeTask)); + } + return Schedule.ABORT; + } else if (runningMergeTasks.size() < getMaxThreadCount()) { boolean added = runningMergeTasks.put(mergeTask.onGoingMerge.getMerge(), mergeTask) == null; assert added : "starting merge task [" + mergeTask + "] registered as already running"; return Schedule.RUN; @@ -244,8 +296,9 
@@ synchronized void mergeTaskFinishedRunning(MergeTask mergeTask) { maybeSignalAllMergesDoneAfterClose(); } - private void mergeTaskDone() { + private void mergeTaskDone(OnGoingMerge merge) { doneMergeTaskCount.incrementAndGet(); + mergeExecutedOrAborted(merge); checkMergeTaskThrottling(); } @@ -256,7 +309,7 @@ private synchronized void maybeSignalAllMergesDoneAfterClose() { } private synchronized void enqueueBackloggedTasks() { - int maxBackloggedTasksToEnqueue = config.getMaxThreadCount() - runningMergeTasks.size(); + int maxBackloggedTasksToEnqueue = getMaxThreadCount() - runningMergeTasks.size(); // enqueue all backlogged tasks when closing, as the queue expects all backlogged tasks to always be enqueued back while (closed || maxBackloggedTasksToEnqueue-- > 0) { MergeTask backloggedMergeTask = backloggedMergeTasks.poll(); @@ -415,7 +468,7 @@ public void run() { try { mergeTaskFinishedRunning(this); } finally { - mergeTaskDone(); + mergeTaskDone(onGoingMerge); } try { // kick-off any follow-up merge @@ -459,7 +512,7 @@ void abort() { if (verbose()) { message(String.format(Locale.ROOT, "merge task %s end abort", this)); } - mergeTaskDone(); + mergeTaskDone(onGoingMerge); } } diff --git a/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java index cdadf85870622..156dcf581ec9c 100644 --- a/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java +++ b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java @@ -691,6 +691,31 @@ public void testAutoIOThrottleForMergeTasks() throws Exception { } } + public void testMergeSchedulerAbortsMergeWhenShouldSkipMergeIsTrue() { + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = mock(ThreadPoolMergeExecutorService.class); + // build a scheduler that always returns true for shouldSkipMerge + ThreadPoolMergeScheduler 
threadPoolMergeScheduler = new ThreadPoolMergeScheduler( + new ShardId("index", "_na_", 1), + IndexSettingsModule.newIndexSettings("index", Settings.builder().build()), + threadPoolMergeExecutorService, + merge -> 0 + ) { + @Override + protected boolean shouldSkipMerge() { + return true; + } + }; + MergeSource mergeSource = mock(MergeSource.class); + OneMerge oneMerge = mock(OneMerge.class); + when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomLongBetween(1L, 10L))); + when(oneMerge.getMergeProgress()).thenReturn(new MergePolicy.OneMergeProgress()); + when(mergeSource.getNextMerge()).thenReturn(oneMerge, (OneMerge) null); + MergeTask mergeTask = threadPoolMergeScheduler.newMergeTask(mergeSource, oneMerge, randomFrom(MergeTrigger.values())); + // verify that calling schedule on the merge task indicates the merge should be aborted + Schedule schedule = threadPoolMergeScheduler.schedule(mergeTask); + assertThat(schedule, is(Schedule.ABORT)); + } + private static MergeInfo getNewMergeInfo(long estimatedMergeBytes) { return getNewMergeInfo(estimatedMergeBytes, randomFrom(-1, randomNonNegativeInt())); } From 6aaaf52f513fdfae5529fcf67ea75686d5cc0978 Mon Sep 17 00:00:00 2001 From: Pooya Salehi Date: Thu, 27 Mar 2025 12:32:19 +0100 Subject: [PATCH 12/14] Move MergeMemoryEstimator (#125686) Relates ES-10961 --- .../index/engine/InternalEngine.java | 9 ++ .../index/engine/MergeMemoryEstimator.java | 118 ++++++++++++++++++ 2 files changed, 127 insertions(+) create mode 100644 server/src/main/java/org/elasticsearch/index/engine/MergeMemoryEstimator.java diff --git a/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java b/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java index aefd43466d617..325d552bcc474 100644 --- a/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java +++ b/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java @@ -3548,4 +3548,13 @@ T 
performActionWithDirectoryReader(SearcherScope scope, CheckedFunctionLucene issue) + * We can work iteratively in providing estimations for different types of fields and vector encodings. + */ +public class MergeMemoryEstimator { + + // Determined empirically by using Accountable.ramBytesUsed() during merges on Lucene using an instrumented build of Lucene. + // Didn't adapt the ramBytesUsed() code for this as it depends on graph levels and size for non-zero levels, which are difficult + // to estimate without actually building the graph. + public static final long HNSW_PER_DOC_ESTIMATION = 348L; + + /** + * Estimates the memory, in bytes, needed to merge the segments of the given merge. + */ + public static long estimateMergeMemory(MergePolicy.OneMerge merge, IndexReader indexReader) { + assert merge.segments.isEmpty() == false; + + long memoryNeeded = 0; + Map segments = merge.segments.stream().collect(Collectors.toMap(s -> s.info.name, s -> s)); + List leaves = indexReader.leaves(); + SegmentReader segmentReader = null; + for (LeafReaderContext leafReaderContext : leaves) { + segmentReader = Lucene.segmentReader(leafReaderContext.reader()); + String segmentName = segmentReader.getSegmentName(); + SegmentCommitInfo segmentCommitInfo = segments.get(segmentName); + if (segmentCommitInfo != null) { + memoryNeeded += estimateMergeMemory(segmentCommitInfo, segmentReader); + segments.remove(segmentName); + if (segments.isEmpty()) { + break; + } + } + } + + // Estimate segments without readers - the searcher may not have been refreshed yet, so estimate them with the field info from + // the last segment reader + if (segmentReader != null) { + for (SegmentCommitInfo segmentCommitInfo : segments.values()) { + memoryNeeded += estimateMergeMemory(segmentCommitInfo, segmentReader); + } + } + + return memoryNeeded; + } + + private static long estimateMergeMemory(SegmentCommitInfo segmentCommitInfo, SegmentReader reader) { + long maxMem = 0; + for (FieldInfo fieldInfo :
reader.getFieldInfos()) { + maxMem = Math.max(maxMem, estimateFieldMemory(fieldInfo, segmentCommitInfo, reader)); + } + return maxMem; + } + + private static long estimateFieldMemory(FieldInfo fieldInfo, SegmentCommitInfo segmentCommitInfo, SegmentReader segmentReader) { + + long maxMem = 0; + if (fieldInfo.hasVectorValues()) { + maxMem = Math.max(maxMem, estimateVectorFieldMemory(fieldInfo, segmentCommitInfo, segmentReader)); + } + // TODO Work on estimations on other field infos when / if needed + + return maxMem; + } + + private static long estimateVectorFieldMemory(FieldInfo fieldInfo, SegmentCommitInfo segmentCommitInfo, SegmentReader segmentReader) { + KnnVectorsReader vectorsReader = segmentReader.getVectorReader(); + if (vectorsReader instanceof PerFieldKnnVectorsFormat.FieldsReader perFieldKnnVectorsFormat) { + vectorsReader = perFieldKnnVectorsFormat.getFieldReader(fieldInfo.getName()); + } + + return getVectorFieldEstimation(fieldInfo, segmentCommitInfo, vectorsReader); + } + + private static long getVectorFieldEstimation(FieldInfo fieldInfo, SegmentCommitInfo segmentCommitInfo, KnnVectorsReader vectorsReader) { + int numDocs = segmentCommitInfo.info.maxDoc() - segmentCommitInfo.getDelCount(); + if (vectorsReader instanceof Lucene99HnswVectorsReader) { + return numDocs * HNSW_PER_DOC_ESTIMATION; + + } else { + // Dominated by the heap byte buffer size used to write each vector + if (fieldInfo.getVectorEncoding() == VectorEncoding.FLOAT32) { + return fieldInfo.getVectorDimension() * VectorEncoding.FLOAT32.byteSize; + } + // Byte does not use buffering for writing but the IndexOutput directly + return 0; + } + } +} From 96b1677abc057aad86ad38ce6e5b1f42fd6d5ee2 Mon Sep 17 00:00:00 2001 From: Albert Zaharovits Date: Mon, 9 Jun 2025 10:24:38 +0300 Subject: [PATCH 13/14] Remove unused project resolver reference --- .../src/main/java/org/elasticsearch/indices/IndicesService.java | 1 - 1 file changed, 1 deletion(-) diff --git 
a/server/src/main/java/org/elasticsearch/indices/IndicesService.java b/server/src/main/java/org/elasticsearch/indices/IndicesService.java index c5711341a8a56..7cb92ec7b3f14 100644 --- a/server/src/main/java/org/elasticsearch/indices/IndicesService.java +++ b/server/src/main/java/org/elasticsearch/indices/IndicesService.java @@ -316,7 +316,6 @@ protected void doStart() { clusterService.getClusterSettings(), nodeEnv ); - this.projectResolver = builder.projectResolver; this.client = builder.client; this.featureService = builder.featureService; this.idFieldDataEnabled = INDICES_ID_FIELD_DATA_ENABLED_SETTING.get(clusterService.getSettings()); From fb018d3c3ecab5af21dcf96e4dd05034afdb057a Mon Sep 17 00:00:00 2001 From: Albert Zaharovits Date: Mon, 9 Jun 2025 10:31:32 +0300 Subject: [PATCH 14/14] Compilation fix --- .../src/test/java/org/elasticsearch/index/IndexModuleTests.java | 1 + 1 file changed, 1 insertion(+) diff --git a/server/src/test/java/org/elasticsearch/index/IndexModuleTests.java b/server/src/test/java/org/elasticsearch/index/IndexModuleTests.java index 62f95ce18bccb..81ca842f9ad65 100644 --- a/server/src/test/java/org/elasticsearch/index/IndexModuleTests.java +++ b/server/src/test/java/org/elasticsearch/index/IndexModuleTests.java @@ -37,6 +37,7 @@ import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.common.UUIDs; import org.elasticsearch.common.breaker.CircuitBreaker; +import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.Setting; import org.elasticsearch.common.settings.Setting.Property; import org.elasticsearch.common.settings.Settings;