elastic · piergm · Jan 3, 2025 · Jan 3, 2025 · Jan 3, 2025 · Jan 7, 2025
diff --git a/docs/changelog/122262.yaml b/docs/changelog/122262.yaml
@@ -0,0 +1,5 @@
+pr: 122262
+summary: Measure search load per index
+area: Search
+type: feature
+issues: []
diff --git a/...in/java/org/elasticsearch/cluster/coordination/SearchIndexTimeTrackingCleanupService.java b/...in/java/org/elasticsearch/cluster/coordination/SearchIndexTimeTrackingCleanupService.java
@@ -0,0 +1,45 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.cluster.coordination;
+
+import org.elasticsearch.cluster.ClusterChangedEvent;
+import org.elasticsearch.cluster.ClusterStateListener;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.search.stats.ShardSearchPerIndexTimeTrackingMetrics;
+
+/**
+ * Service responsible for cleaning up task execution time tracking for deleted indices.
+ * Implements the ClusterStateListener interface to listen for cluster state changes.
+ */
+public class SearchIndexTimeTrackingCleanupService implements ClusterStateListener {
+
+    private ShardSearchPerIndexTimeTrackingMetrics listener;
+
+    /**
+     * Constructor.
+     *
+     * @param listener the listener for shard search time tracking metrics
+     */
+    public SearchIndexTimeTrackingCleanupService(ShardSearchPerIndexTimeTrackingMetrics listener) {
+        this.listener = listener;
+    }
+
+    /**
+     * Called when the cluster state changes. Stops tracking execution time for deleted indices.
+     *
+     * @param event the cluster changed event
+     */
+    @Override
+    public void clusterChanged(ClusterChangedEvent event) {
+        for (Index index : event.indicesDeleted()) {
+            listener.stopTrackingIndex(index.getName());
+        }
+    }
+}
diff --git a/...ain/java/org/elasticsearch/index/search/stats/ShardSearchPerIndexTimeTrackingMetrics.java b/...ain/java/org/elasticsearch/index/search/stats/ShardSearchPerIndexTimeTrackingMetrics.java
@@ -0,0 +1,146 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.index.search.stats;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.elasticsearch.common.ExponentiallyWeightedMovingAverage;
+import org.elasticsearch.core.Tuple;
+import org.elasticsearch.index.shard.IndexShard;
+import org.elasticsearch.index.shard.SearchOperationListener;
+import org.elasticsearch.search.internal.SearchContext;
+
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.atomic.LongAdder;
+
+/**
+ * This class implements the {@link SearchOperationListener} interface to track the execution time of search operations
+ * on a per-index basis. It uses a {@link ConcurrentHashMap} to store the execution times and an
+ * {@link ExponentiallyWeightedMovingAverage} to calculate the exponentially weighted moving average (EWMA) of the
+ * execution times.
+ */
+public final class ShardSearchPerIndexTimeTrackingMetrics implements SearchOperationListener {
+
+    private static final Logger logger = LogManager.getLogger(ShardSearchPerIndexTimeTrackingMetrics.class);
+
+    private final ConcurrentHashMap<String, Tuple<LongAdder, ExponentiallyWeightedMovingAverage>> indexExecutionTime;
+
+    private final double ewmaAlpha;
+
+    /**
+     * Constructs a new ShardSearchPerIndexTimeTrackingMetrics instance with the specified EWMA alpha value.
+     *
+     * @param ewmaAlpha the alpha value for the EWMA calculation
+     */
+    public ShardSearchPerIndexTimeTrackingMetrics(double ewmaAlpha) {
+        this.indexExecutionTime = new ConcurrentHashMap<>();
+        this.ewmaAlpha = ewmaAlpha;
+    }
+
+    /**
+     * Tracks the execution time of the query phase of a search operation.
+     *
+     * @param searchContext the search context
+     * @param tookInNanos the time taken in nanoseconds
+     */
+    @Override
+    public void onQueryPhase(SearchContext searchContext, long tookInNanos) {
+        trackExecutionTime(searchContext, tookInNanos);
+    }
+
+    /**
+     * Tracks the execution time of a failed query phase of a search operation.
+     *
+     * @param searchContext the search context
+     * @param tookInNanos the time taken in nanoseconds
+     */
+    @Override
+    public void onFailedQueryPhase(SearchContext searchContext, long tookInNanos) {
+        trackExecutionTime(searchContext, tookInNanos);
+    }
+
+    /**
+     * Tracks the execution time of the fetch phase of a search operation.
+     *
+     * @param searchContext the search context
+     * @param tookInNanos the time taken in nanoseconds
+     */
+    @Override
+    public void onFetchPhase(SearchContext searchContext, long tookInNanos) {
+        trackExecutionTime(searchContext, tookInNanos);
+    }
+
+    /**
+     * Tracks the execution time of a failed fetch phase of a search operation.
+     *
+     * @param searchContext the search context
+     * @param tookInNanos the time taken in nanoseconds
+     */
+    @Override
+    public void onFailedFetchPhase(SearchContext searchContext, long tookInNanos) {
+        trackExecutionTime(searchContext, tookInNanos);
+    }
+
+    /**
+     * Tracks the execution time of a search operation.
+     *
+     * @param searchContext the search context
+     * @param tookInNanos the time taken in nanoseconds
+     */
+    private void trackExecutionTime(SearchContext searchContext, long tookInNanos) {
+        IndexShard indexShard = searchContext.indexShard();
+        if (indexShard != null && indexShard.isSystem() == false) {
+            String indexName = indexShard.shardId().getIndexName();
+            if (indexName != null) {
+                Tuple<LongAdder, ExponentiallyWeightedMovingAverage> t = indexExecutionTime.computeIfAbsent(
+                    indexName,
+                    k -> new Tuple<>(new LongAdder(), new ExponentiallyWeightedMovingAverage(ewmaAlpha, 0))
+                );
+                t.v1().add(tookInNanos);
+                t.v2().addValue(tookInNanos);
+            }
+        }
+    }
+
+    /**
+     * Gets the total execution time for tasks associated with a specific index.
+     *
+     * @param indexName the name of the index
+     * @return the total execution time for the index
+     */
+    public long getSearchLoadPerIndex(String indexName) {
+        Tuple<LongAdder, ExponentiallyWeightedMovingAverage> t = indexExecutionTime.get(indexName);
+        return (t != null) ? t.v1().sum() : 0;
+    }
+
+    /**
+     * Gets the exponentially weighted moving average (EWMA) of the execution time for tasks associated with a specific index name.
+     *
+     * @param indexName the name of the index
+     * @return the EWMA of the execution time for the index
+     */
+    public double getLoadEMWAPerIndex(String indexName) {
+        Tuple<LongAdder, ExponentiallyWeightedMovingAverage> t = indexExecutionTime.get(indexName);
+        return (t != null) ? t.v2().getAverage() : 0;
+    }
+
+    /**
+     * Stops tracking the execution time for tasks associated with a specific index.
+     *
+     * @param indexName the name of the index
+     */
+    public void stopTrackingIndex(String indexName) {
+        if (indexExecutionTime.containsKey(indexName)) {
+            indexExecutionTime.remove(indexName);
+        } else {
+            logger.debug("Trying to stop tracking index [{}] that was never tracked", indexName);
+        }
+    }
+}
diff --git a/server/src/main/java/org/elasticsearch/index/search/stats/ShardSearchStats.java b/server/src/main/java/org/elasticsearch/index/search/stats/ShardSearchStats.java
@@ -64,7 +64,7 @@ public void onPreQueryPhase(SearchContext searchContext) {
     }
 
     @Override
-    public void onFailedQueryPhase(SearchContext searchContext) {
+    public void onFailedQueryPhase(SearchContext searchContext, long tookInNanos) {
         computeStats(searchContext, statsHolder -> {
             if (searchContext.hasOnlySuggest()) {
                 statsHolder.suggestCurrent.dec();
@@ -92,7 +92,7 @@ public void onPreFetchPhase(SearchContext searchContext) {
     }
 
     @Override
-    public void onFailedFetchPhase(SearchContext searchContext) {
+    public void onFailedFetchPhase(SearchContext searchContext, long tookInNanos) {
         computeStats(searchContext, statsHolder -> {
             statsHolder.fetchCurrent.dec();
             statsHolder.fetchFailure.inc();

diff --git a/server/src/main/java/org/elasticsearch/index/shard/SearchOperationListener.java b/server/src/main/java/org/elasticsearch/index/shard/SearchOperationListener.java
@@ -30,16 +30,17 @@ default void onPreQueryPhase(SearchContext searchContext) {}
     /**
      * Executed if a query phased failed.
      * @param searchContext the current search context
+     * @param tookInNanos the number of nanoseconds the query execution took
      */
-    default void onFailedQueryPhase(SearchContext searchContext) {}
+    default void onFailedQueryPhase(SearchContext searchContext, long tookInNanos) {}
 
     /**
      * Executed after the query phase successfully finished.
      * Note: this is not invoked if the query phase execution failed.
      * @param searchContext the current search context
      * @param tookInNanos the number of nanoseconds the query execution took
      *
-     * @see #onFailedQueryPhase(SearchContext)
+     * @see #onFailedQueryPhase(SearchContext, long)
      */
     default void onQueryPhase(SearchContext searchContext, long tookInNanos) {}
 
@@ -52,16 +53,17 @@ default void onPreFetchPhase(SearchContext searchContext) {}
     /**
      * Executed if a fetch phased failed.
      * @param searchContext the current search context
+     * @param tookInNanos the number of nanoseconds the query execution took
      */
-    default void onFailedFetchPhase(SearchContext searchContext) {}
+    default void onFailedFetchPhase(SearchContext searchContext, long tookInNanos) {}
 
     /**
      * Executed after the fetch phase successfully finished.
      * Note: this is not invoked if the fetch phase execution failed.
      * @param searchContext the current search context
      * @param tookInNanos the number of nanoseconds the fetch execution took
      *
-     * @see #onFailedFetchPhase(SearchContext)
+     * @see #onFailedFetchPhase(SearchContext, long)
      */
     default void onFetchPhase(SearchContext searchContext, long tookInNanos) {}
 
@@ -128,10 +130,10 @@ public void onPreQueryPhase(SearchContext searchContext) {
         }
 
         @Override
-        public void onFailedQueryPhase(SearchContext searchContext) {
+        public void onFailedQueryPhase(SearchContext searchContext, long tookInNanos) {
             for (SearchOperationListener listener : listeners) {
                 try {
-                    listener.onFailedQueryPhase(searchContext);
+                    listener.onFailedQueryPhase(searchContext, tookInNanos);
                 } catch (Exception e) {
                     logger.warn(() -> "onFailedQueryPhase listener [" + listener + "] failed", e);
                 }
@@ -161,10 +163,10 @@ public void onPreFetchPhase(SearchContext searchContext) {
         }
 
         @Override
-        public void onFailedFetchPhase(SearchContext searchContext) {
+        public void onFailedFetchPhase(SearchContext searchContext, long tookInNanos) {
             for (SearchOperationListener listener : listeners) {
                 try {
-                    listener.onFailedFetchPhase(searchContext);
+                    listener.onFailedFetchPhase(searchContext, tookInNanos);
                 } catch (Exception e) {
                     logger.warn(() -> "onFailedFetchPhase listener [" + listener + "] failed", e);
                 }

@@ -41,6 +41,7 @@
 import org.elasticsearch.cluster.coordination.CoordinationDiagnosticsService;
 import org.elasticsearch.cluster.coordination.Coordinator;
 import org.elasticsearch.cluster.coordination.MasterHistoryService;
+import org.elasticsearch.cluster.coordination.SearchIndexTimeTrackingCleanupService;
 import org.elasticsearch.cluster.coordination.StableMasterHealthIndicatorService;
 import org.elasticsearch.cluster.metadata.DataStreamFailureStoreSettings;
 import org.elasticsearch.cluster.metadata.DataStreamGlobalRetentionSettings;
@@ -82,6 +83,7 @@
 import org.elasticsearch.common.settings.SettingsModule;
 import org.elasticsearch.common.util.BigArrays;
 import org.elasticsearch.common.util.PageCacheRecycler;
+import org.elasticsearch.common.util.concurrent.EsExecutors;
 import org.elasticsearch.common.util.set.Sets;
 import org.elasticsearch.core.IOUtils;
 import org.elasticsearch.core.SuppressForbidden;
@@ -120,6 +122,7 @@
 import org.elasticsearch.index.analysis.AnalysisRegistry;
 import org.elasticsearch.index.mapper.MapperMetrics;
 import org.elasticsearch.index.mapper.SourceFieldMetrics;
+import org.elasticsearch.index.search.stats.ShardSearchPerIndexTimeTrackingMetrics;
 import org.elasticsearch.index.search.stats.ShardSearchPhaseAPMMetrics;
 import org.elasticsearch.index.shard.SearchOperationListener;
 import org.elasticsearch.indices.ExecutorSelector;
@@ -818,10 +821,17 @@ private void construct(
             threadPool::relativeTimeInMillis
         );
         MapperMetrics mapperMetrics = new MapperMetrics(sourceFieldMetrics);
+
+        ShardSearchPerIndexTimeTrackingMetrics listener = new ShardSearchPerIndexTimeTrackingMetrics(
+            EsExecutors.TaskTrackingConfig.DEFAULT.getEwmaAlpha()
+        );
         final List<SearchOperationListener> searchOperationListeners = List.of(
-            new ShardSearchPhaseAPMMetrics(telemetryProvider.getMeterRegistry())
+            new ShardSearchPhaseAPMMetrics(telemetryProvider.getMeterRegistry()),
+            listener
         );
 
+        clusterService.addListener(new SearchIndexTimeTrackingCleanupService(listener));
+
         List<? extends SlowLogFieldProvider> slowLogFieldProviders = pluginsService.loadServiceProviders(SlowLogFieldProvider.class);
         // NOTE: the response of index/search slow log fields below must be calculated dynamically on every call
         // because the responses may change dynamically at runtime

diff --git a/server/src/main/java/org/elasticsearch/search/SearchService.java b/server/src/main/java/org/elasticsearch/search/SearchService.java
@@ -2018,9 +2018,9 @@ public void close() {
                     }
                 } else {
                     if (fetch) {
-                        listener.onFailedFetchPhase(context);
+                        listener.onFailedFetchPhase(context, System.nanoTime() - time);
                     } else {
-                        listener.onFailedQueryPhase(context);
+                        listener.onFailedQueryPhase(context, System.nanoTime() - time);
                     }
                 }
             }

diff --git a/server/src/test/java/org/elasticsearch/index/shard/SearchOperationListenerTests.java b/server/src/test/java/org/elasticsearch/index/shard/SearchOperationListenerTests.java
@@ -51,7 +51,7 @@ public void onPreQueryPhase(SearchContext searchContext) {
             }
 
             @Override
-            public void onFailedQueryPhase(SearchContext searchContext) {
+            public void onFailedQueryPhase(SearchContext searchContext, long tookInNanos) {
                 assertNotNull(searchContext);
                 failedQuery.incrementAndGet();
             }
@@ -71,7 +71,7 @@ public void onPreFetchPhase(SearchContext searchContext) {
             }
 
             @Override
-            public void onFailedFetchPhase(SearchContext searchContext) {
+            public void onFailedFetchPhase(SearchContext searchContext, long tookInNanos) {
                 assertNotNull(searchContext);
                 failedFetch.incrementAndGet();
             }
@@ -191,7 +191,7 @@ public void validateReaderContext(ReaderContext readerContext, TransportRequest
             assertEquals(0, freeScrollContext.get());
             assertEquals(0, validateSearchContext.get());
 
-            compositeListener.onFailedFetchPhase(ctx);
+            compositeListener.onFailedFetchPhase(ctx, -1);
             assertEquals(2, preFetch.get());
             assertEquals(2, preQuery.get());
             assertEquals(2, failedFetch.get());
@@ -204,7 +204,7 @@ public void validateReaderContext(ReaderContext readerContext, TransportRequest
             assertEquals(0, freeScrollContext.get());
             assertEquals(0, validateSearchContext.get());
 
-            compositeListener.onFailedQueryPhase(ctx);
+            compositeListener.onFailedQueryPhase(ctx, -1);
             assertEquals(2, preFetch.get());
             assertEquals(2, preQuery.get());
             assertEquals(2, failedFetch.get());