-
Notifications
You must be signed in to change notification settings - Fork 25.6k
Add metrics max queue latency #133959
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
nicktindall
merged 12 commits into
elastic:main
from
nicktindall:ES-12631_add_metrics_max_queue_latency
Sep 4, 2025
Merged
Add metrics max queue latency #133959
Changes from 4 commits
Commits
Show all changes
12 commits
Select commit
Hold shift + click to select a range
c4db2e3
Publish max queue latency metric
nicktindall c2970be
Add test, fix units in metrics
nicktindall 2fc4dde
Make metric naming consistent with DesiredBalanceMetrics
nicktindall 151086e
Merge branch 'main' into ES-12631_add_metrics_max_queue_latency
nicktindall 02053c7
Use DesiredBalanceMetrics rather than adding a new class
nicktindall e1b9dfc
Simplify thread-pool blocking
nicktindall c28de52
Simplify thread-pool blocking
nicktindall 075be86
Make sure blocked task is dequeued before refreshing cluster info
nicktindall 19fc291
Revert "Make sure blocked task is dequeued before refreshing cluster info"
nicktindall 0bb4e53
Use common refreshClusterInfo, don't use masterOnlyNode
nicktindall 94955a1
Merge remote-tracking branch 'origin/main' into ES-12631_add_metrics_max_queue_latency
nicktindall 5f4cf5f
Merge remote-tracking branch 'origin/main' into ES-12631_add_metrics_max_queue_latency
nicktindall File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,6 +25,7 @@ | |
| import org.elasticsearch.cluster.routing.RoutingNodes; | ||
| import org.elasticsearch.cluster.routing.ShardRouting; | ||
| import org.elasticsearch.cluster.routing.UnassignedInfo; | ||
| import org.elasticsearch.cluster.routing.allocation.AllocationDeciderMetrics; | ||
| import org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintSettings; | ||
| import org.elasticsearch.cluster.service.ClusterService; | ||
| import org.elasticsearch.common.settings.Settings; | ||
|
|
@@ -36,6 +37,8 @@ | |
| import org.elasticsearch.index.shard.ShardPath; | ||
| import org.elasticsearch.index.store.StoreStats; | ||
| import org.elasticsearch.plugins.Plugin; | ||
| import org.elasticsearch.plugins.PluginsService; | ||
| import org.elasticsearch.telemetry.TestTelemetryPlugin; | ||
| import org.elasticsearch.test.ClusterServiceUtils; | ||
| import org.elasticsearch.test.ESIntegTestCase; | ||
| import org.elasticsearch.test.transport.MockTransportService; | ||
|
|
@@ -45,18 +48,28 @@ | |
| import java.nio.file.Path; | ||
| import java.util.ArrayList; | ||
| import java.util.Collection; | ||
| import java.util.HashMap; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.concurrent.CyclicBarrier; | ||
|
|
||
| import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_REPLICAS; | ||
| import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_SHARDS; | ||
| import static org.hamcrest.Matchers.everyItem; | ||
| import static org.hamcrest.Matchers.greaterThanOrEqualTo; | ||
| import static org.hamcrest.Matchers.hasSize; | ||
|
|
||
| @ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST, numDataNodes = 0) | ||
| public class WriteLoadConstraintDeciderIT extends ESIntegTestCase { | ||
|
|
||
| /** | ||
| * Returns the plugin classes from {@code super.nodePlugins()} with {@link MockTransportService.TestPlugin} | ||
| * and {@link TestTelemetryPlugin} appended. | ||
| */ | ||
| @Override | ||
| @SuppressWarnings("unchecked") | ||
| protected Collection<Class<? extends Plugin>> getMockPlugins() { | ||
| // NOTE(review): the single-line return below looks like the removed (pre-change) side of this diff hunk, | ||
| // superseded by the multi-line return that follows it - confirm against the merged file. | ||
| return CollectionUtils.appendToCopy(super.nodePlugins(), MockTransportService.TestPlugin.class); | ||
| return CollectionUtils.appendToCopyNoNullElements( | ||
| super.nodePlugins(), | ||
| MockTransportService.TestPlugin.class, | ||
| TestTelemetryPlugin.class | ||
| ); | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -227,11 +240,7 @@ public void testHighNodeWriteLoadPreventsNewShardAllocation() { | |
| */ | ||
|
|
||
| logger.info("---> Refreshing the cluster info to pull in the dummy thread pool stats with a hot-spotting node"); | ||
| final InternalClusterInfoService clusterInfoService = asInstanceOf( | ||
| InternalClusterInfoService.class, | ||
| internalCluster().getInstance(ClusterInfoService.class, masterName) | ||
| ); | ||
| ClusterInfoServiceUtils.refresh(clusterInfoService); | ||
| refreshClusterInfo(masterName); | ||
|
|
||
| logger.info( | ||
| "---> Update the filter to exclude " + firstDataNodeName + " so that shards will be reassigned away to the other nodes" | ||
|
|
@@ -254,6 +263,76 @@ public void testHighNodeWriteLoadPreventsNewShardAllocation() { | |
| })); | ||
| } | ||
|
|
||
| /** | ||
| * Starts one master-only node and two data-only nodes with the write load decider enabled, then checks that | ||
| * a max queue latency gauge value is recorded for every data node after a cluster info refresh. It then holds | ||
| * a task in the WRITE thread pool queue of one randomly chosen data node for {@code delayMillis} and checks | ||
| * that, after another refresh, the value recorded for that node is at least {@code delayMillis}. | ||
| */ | ||
| public void testMaxQueueLatencyMetricIsPublished() { | ||
| final Settings settings = Settings.builder() | ||
| .put( | ||
| WriteLoadConstraintSettings.WRITE_LOAD_DECIDER_ENABLED_SETTING.getKey(), | ||
| WriteLoadConstraintSettings.WriteLoadDeciderStatus.ENABLED | ||
| ) | ||
| .build(); | ||
| final String masterName = internalCluster().startMasterOnlyNode(settings); | ||
| final var dataNodes = internalCluster().startDataOnlyNodes(2, settings); | ||
| ensureStableCluster(3); | ||
|
|
||
| // Refresh cluster info (should trigger polling) | ||
| refreshClusterInfo(masterName); | ||
|
|
||
| // Baseline: one non-negative value per data node before any artificial delay is introduced | ||
| Map<String, Long> mostRecentQueueLatencyMetrics = getMostRecentQueueLatencyMetrics(dataNodes); | ||
| assertThat(mostRecentQueueLatencyMetrics.keySet(), hasSize(dataNodes.size())); | ||
| assertThat(mostRecentQueueLatencyMetrics.values(), everyItem(greaterThanOrEqualTo(0L))); | ||
|
|
||
| final String dataNodeToDelay = randomFrom(dataNodes); | ||
| final ThreadPool threadPoolToDelay = internalCluster().getInstance(ThreadPool.class, dataNodeToDelay); | ||
|
|
||
| // Fill the write thread pool | ||
| final int writeThreadPoolSize = threadPoolToDelay.info(ThreadPool.Names.WRITE).getMax(); | ||
| // Barrier parties: one per blocking task submitted below, plus this test thread | ||
| final CyclicBarrier delayLatch = new CyclicBarrier(writeThreadPoolSize + 1); | ||
| for (int i = 0; i < writeThreadPoolSize; i++) { | ||
| threadPoolToDelay.executor(ThreadPool.Names.WRITE).execute(() -> { | ||
| // First await: meets the test thread once every blocking task has started running | ||
| safeAwait(delayLatch); | ||
| // Second await: keeps this task running until the test thread releases the barrier again | ||
| safeAwait(delayLatch); | ||
|
||
| }); | ||
| } | ||
| // First barrier trip: all blocking tasks are now running (presumably occupying every WRITE thread - confirm) | ||
| safeAwait(delayLatch); | ||
| // Submit a task that will be delayed | ||
| threadPoolToDelay.executor(ThreadPool.Names.WRITE).execute(() -> { | ||
| // Doesn't need to do anything | ||
| }); | ||
| final long delayMillis = randomIntBetween(100, 200); | ||
| safeSleep(delayMillis); | ||
| // Unblock the pool | ||
| safeAwait(delayLatch); | ||
|
|
||
| // Refresh again and expect the delayed node to report at least the time the extra task was held back | ||
| refreshClusterInfo(masterName); | ||
| mostRecentQueueLatencyMetrics = getMostRecentQueueLatencyMetrics(dataNodes); | ||
| assertThat(mostRecentQueueLatencyMetrics.keySet(), hasSize(dataNodes.size())); | ||
| assertThat(mostRecentQueueLatencyMetrics.get(dataNodeToDelay), greaterThanOrEqualTo(delayMillis)); | ||
| } | ||
|
|
||
| /** | ||
| * Fetches the {@link ClusterInfoService} instance of the named node, obtains it as an | ||
| * {@link InternalClusterInfoService} via {@code asInstanceOf}, and passes it to | ||
| * {@link ClusterInfoServiceUtils#refresh}. | ||
| * | ||
| * @param masterName name of the node whose cluster info service is refreshed | ||
| */ | ||
| private static void refreshClusterInfo(String masterName) { | ||
| final InternalClusterInfoService clusterInfoService = asInstanceOf( | ||
| InternalClusterInfoService.class, | ||
| internalCluster().getInstance(ClusterInfoService.class, masterName) | ||
| ); | ||
| ClusterInfoServiceUtils.refresh(clusterInfoService); | ||
| } | ||
|
|
||
| /** | ||
| * For each given node, triggers a collection on its {@link TestTelemetryPlugin} and reads the long gauge | ||
| * measurements recorded under {@link AllocationDeciderMetrics#WRITE_LOAD_DECIDER_MAX_LATENCY_VALUE}. | ||
| * | ||
| * @param dataNodes names of the nodes to read measurements from | ||
| * @return a map from node name to the value of the last measurement in that node's list; nodes with no | ||
| * measurements are left out of the map | ||
| */ | ||
| private static Map<String, Long> getMostRecentQueueLatencyMetrics(List<String> dataNodes) { | ||
| final Map<String, Long> measurements = new HashMap<>(); | ||
| for (String nodeName : dataNodes) { | ||
| PluginsService pluginsService = internalCluster().getInstance(PluginsService.class, nodeName); | ||
| final TestTelemetryPlugin telemetryPlugin = pluginsService.filterPlugins(TestTelemetryPlugin.class).findFirst().orElseThrow(); | ||
| telemetryPlugin.collect(); | ||
| final var maxLatencyValues = telemetryPlugin.getLongGaugeMeasurement( | ||
| AllocationDeciderMetrics.WRITE_LOAD_DECIDER_MAX_LATENCY_VALUE | ||
| ); | ||
| // Skip nodes without any measurement so callers can detect them through the size of the key set | ||
| if (maxLatencyValues.isEmpty() == false) { | ||
| measurements.put(nodeName, maxLatencyValues.getLast().getLong()); | ||
| } | ||
| } | ||
| return measurements; | ||
| } | ||
|
|
||
| /** | ||
| * Verifies that the {@link RoutingNodes} shows that the expected portion of an index's shards are assigned to each node. | ||
| */ | ||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
39 changes: 39 additions & 0 deletions
39
.../src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationDeciderMetrics.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,39 @@ | ||
| /* | ||
| * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
| * or more contributor license agreements. Licensed under the "Elastic License | ||
| * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side | ||
| * Public License v 1"; you may not use this file except in compliance with, at | ||
| * your election, the "Elastic License 2.0", the "GNU Affero General Public | ||
| * License v3.0 only", or the "Server Side Public License, v 1". | ||
| */ | ||
|
|
||
| package org.elasticsearch.cluster.routing.allocation; | ||
|
|
||
| import org.elasticsearch.telemetry.metric.LongWithAttributes; | ||
| import org.elasticsearch.telemetry.metric.MeterRegistry; | ||
|
|
||
| import java.util.Collection; | ||
| import java.util.function.Supplier; | ||
|
|
||
| /** | ||
| * A place where metrics related to allocation deciders can live | ||
| */ | ||
| public class AllocationDeciderMetrics { | ||
|
|
||
| /** Name under which the write load decider max latency gauge is registered. */ | ||
| public static final String WRITE_LOAD_DECIDER_MAX_LATENCY_VALUE = "es.allocator.deciders.write_load.max_latency_value.current"; | ||
|
|
||
| /** Registry that gauges are registered with. */ | ||
| private final MeterRegistry meterRegistry; | ||
|
|
||
| /** | ||
| * @param meterRegistry registry used by the register methods of this class | ||
| */ | ||
| public AllocationDeciderMetrics(MeterRegistry meterRegistry) { | ||
| this.meterRegistry = meterRegistry; | ||
| } | ||
|
|
||
| /** | ||
| * Registers a longs gauge named {@link #WRITE_LOAD_DECIDER_MAX_LATENCY_VALUE} with unit {@code "ms"}. | ||
| * | ||
| * @param maxLatencySupplier supplier handed to the registry as the source of the gauge's values | ||
| */ | ||
| public void registerWriteLoadDeciderMaxLatencyGauge(Supplier<Collection<LongWithAttributes>> maxLatencySupplier) { | ||
| meterRegistry.registerLongsGauge( | ||
| WRITE_LOAD_DECIDER_MAX_LATENCY_VALUE, | ||
| "max latency for write load decider", | ||
| "ms", | ||
| maxLatencySupplier | ||
| ); | ||
| } | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.