docs/changelog/135652.yaml — 5 additions, 0 deletions
@@ -0,0 +1,5 @@
pr: 135652
summary: DFS search phase per shard duration APM metric
area: Search
type: enhancement
issues: []
ShardSearchPhaseAPMMetrics.java
@@ -22,13 +22,20 @@

public final class ShardSearchPhaseAPMMetrics implements SearchOperationListener {

public static final String DFS_SEARCH_PHASE_METRIC = "es.search.shards.phases.dfs.duration.histogram";
public static final String QUERY_SEARCH_PHASE_METRIC = "es.search.shards.phases.query.duration.histogram";
public static final String FETCH_SEARCH_PHASE_METRIC = "es.search.shards.phases.fetch.duration.histogram";

private final LongHistogram dfsPhaseMetric;
private final LongHistogram queryPhaseMetric;
private final LongHistogram fetchPhaseMetric;

public ShardSearchPhaseAPMMetrics(MeterRegistry meterRegistry) {
this.dfsPhaseMetric = meterRegistry.registerLongHistogram(
DFS_SEARCH_PHASE_METRIC,
"DFS search phase execution times at the shard level, expressed as a histogram",
"ms"
);
this.queryPhaseMetric = meterRegistry.registerLongHistogram(
QUERY_SEARCH_PHASE_METRIC,
"Query search phase execution times at the shard level, expressed as a histogram",
@@ -41,6 +48,11 @@ public ShardSearchPhaseAPMMetrics(MeterRegistry meterRegistry) {
);
}

@Override
public void onDfsPhase(SearchContext searchContext, long tookInNanos) {
recordPhaseLatency(dfsPhaseMetric, tookInNanos);
}

@Override
public void onQueryPhase(SearchContext searchContext, long tookInNanos) {
SearchExecutionContext searchExecutionContext = searchContext.getSearchExecutionContext();
@@ -55,6 +67,10 @@ public void onFetchPhase(SearchContext searchContext, long tookInNanos) {
recordPhaseLatency(fetchPhaseMetric, tookInNanos, searchContext.request(), rangeTimestampFrom);
}

private static void recordPhaseLatency(LongHistogram histogramMetric, long tookInNanos) {
histogramMetric.record(TimeUnit.NANOSECONDS.toMillis(tookInNanos));
}

private static void recordPhaseLatency(
LongHistogram histogramMetric,
long tookInNanos,
Test changes for the shard search phase APM metrics:
@@ -34,6 +34,7 @@

import static org.elasticsearch.action.support.WriteRequest.RefreshPolicy.IMMEDIATE;
import static org.elasticsearch.index.query.QueryBuilders.simpleQueryStringQuery;
import static org.elasticsearch.index.search.stats.ShardSearchPhaseAPMMetrics.DFS_SEARCH_PHASE_METRIC;
import static org.elasticsearch.index.search.stats.ShardSearchPhaseAPMMetrics.FETCH_SEARCH_PHASE_METRIC;
import static org.elasticsearch.index.search.stats.ShardSearchPhaseAPMMetrics.QUERY_SEARCH_PHASE_METRIC;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;
@@ -92,6 +93,8 @@ public void testMetricsDfsQueryThenFetch() {
client().prepareSearch(indexName).setSearchType(SearchType.DFS_QUERY_THEN_FETCH).setQuery(simpleQueryStringQuery("doc1")),
"1"
);
final List<Measurement> dfsMeasurements = getTestTelemetryPlugin().getLongHistogramMeasurement(DFS_SEARCH_PHASE_METRIC);
assertEquals(num_primaries, dfsMeasurements.size());
Reviewer (Member):

Can you check that the measurements make some sense? For instance, are they always greater than 0? Are they always lower than the total took time?

@chrisparrinello (Contributor, author) — Sep 30, 2025:

Unfortunately, they're not always greater than zero because we convert nanoseconds to milliseconds before storing them in the histogram, so if something took less than a millisecond we record a zero. This definitely happens in the unit tests. I took a stab at checking against took time, but that means I need to pull apart all of the asserts to get the SearchResponse, for example:

public void testMetricsDfsQueryThenFetch() {
    SearchRequestBuilder requestBuilder = client().prepareSearch(indexName)
        .setSearchType(SearchType.DFS_QUERY_THEN_FETCH)
        .setQuery(simpleQueryStringQuery("doc1"));
    SearchResponse searchResponse = requestBuilder.get();
    try {
        assertNoFailures(searchResponse);
        assertHitCount(searchResponse, 1);
        assertSearchHits(searchResponse, "1");
        final List<Measurement> dfsMeasurements = getTestTelemetryPlugin().getLongHistogramMeasurement(DFS_SEARCH_PHASE_METRIC);
        assertMeasurements(dfsMeasurements, num_primaries, searchResponse.getTook().millis());
        final List<Measurement> queryMeasurements = getTestTelemetryPlugin().getLongHistogramMeasurement(QUERY_SEARCH_PHASE_METRIC);
        assertEquals(num_primaries, queryMeasurements.size());
        final List<Measurement> fetchMeasurements = getTestTelemetryPlugin().getLongHistogramMeasurement(FETCH_SEARCH_PHASE_METRIC);
        assertEquals(1, fetchMeasurements.size());
        assertAttributes(fetchMeasurements, false, false);
    } finally {
        searchResponse.decRef();
    }
}

where assertMeasurements checks to make sure the measurements are less than or equal to the took time from the response and we have the right number of measurements. Let me know if you want to take this approach and I'll modify all of the tests to make sure we're asserting valid measurements.
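
For illustration, a minimal sketch of what an assertMeasurements helper along those lines could look like. The helper is hypothetical and not part of this PR; it assumes Measurement exposes the recorded value as a long (e.g. getLong()) and that Hamcrest's greaterThanOrEqualTo / lessThanOrEqualTo are statically imported in the test class:

// Hypothetical helper, sketched for illustration only (not part of this PR).
// Assumes Measurement exposes the recorded value as a long; adjust the accessor if needed.
private static void assertMeasurements(List<Measurement> measurements, int expectedCount, long tookMillis) {
    // right number of per-shard measurements
    assertEquals(expectedCount, measurements.size());
    for (Measurement measurement : measurements) {
        long shardPhaseMillis = measurement.getLong();
        // per-shard phase duration is non-negative and never exceeds the coordinator-level took time
        assertThat(shardPhaseMillis, greaterThanOrEqualTo(0L));
        assertThat(shardPhaseMillis, lessThanOrEqualTo(tookMillis));
    }
}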

About the nanoseconds getting converted to 0 milliseconds: one thought I had was to change the units from milliseconds to microseconds or nanoseconds, but the issue is that the underlying OpenTelemetry histogram implementation buckets the measurements before reporting to the APM server, and there is an upper bound to the buckets (something like 110k), so if you choose the wrong scale you lose precision for measurements greater than 110k. There is a way to control the bucketing, but it is buried deep in the OpenTelemetry meter code.
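
As a quick standalone illustration of the truncation described above (not code from this PR; the durations are made-up values): TimeUnit.NANOSECONDS.toMillis truncates, so any phase that finishes in under one millisecond records a zero.

import java.util.concurrent.TimeUnit;

public class NanosToMillisRounding {
    public static void main(String[] args) {
        long subMillisPhaseNanos = 750_000L;   // 0.75 ms, plausible for a tiny test shard
        long typicalPhaseNanos = 12_345_678L;  // roughly 12.3 ms
        // toMillis truncates toward zero, so sub-millisecond durations record as 0
        System.out.println(TimeUnit.NANOSECONDS.toMillis(subMillisPhaseNanos)); // prints 0
        System.out.println(TimeUnit.NANOSECONDS.toMillis(typicalPhaseNanos));   // prints 12
    }
}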

Reviewer (Member):

Right, sorry, took time is at the coordinator level, so it's not possible to get it here. I see! Thanks for the explanation about the rounding, and for checking further about precision. I think we are good here!

final List<Measurement> queryMeasurements = getTestTelemetryPlugin().getLongHistogramMeasurement(QUERY_SEARCH_PHASE_METRIC);
assertEquals(num_primaries, queryMeasurements.size());
final List<Measurement> fetchMeasurements = getTestTelemetryPlugin().getLongHistogramMeasurement(FETCH_SEARCH_PHASE_METRIC);
@@ -106,6 +109,8 @@ public void testMetricsDfsQueryThenFetchSystem() {
.setQuery(simpleQueryStringQuery("doc1")),
"1"
);
final List<Measurement> dfsMeasurements = getTestTelemetryPlugin().getLongHistogramMeasurement(DFS_SEARCH_PHASE_METRIC);
assertEquals(0, dfsMeasurements.size()); // DFS phase not done for index with single shard
final List<Measurement> queryMeasurements = getTestTelemetryPlugin().getLongHistogramMeasurement(QUERY_SEARCH_PHASE_METRIC);
assertEquals(1, queryMeasurements.size());
final List<Measurement> fetchMeasurements = getTestTelemetryPlugin().getLongHistogramMeasurement(FETCH_SEARCH_PHASE_METRIC);