Skip to content

Commit 0ed5959

Browse files
authored
Collect query metrics on search nodes (#114267)
When I added the query/fetch metrics, I overlooked that non-primary shards were being skipped during metrics collection, and the stateful tests didn't catch it. This change ensures that search metrics are now collected from every shard copy.
1 parent 43e5258 commit 0ed5959

File tree

2 files changed: +111 additions, -115 deletions (lines changed)

2 files changed

+111
-115
lines changed

server/src/internalClusterTest/java/org/elasticsearch/monitor/metrics/IndicesMetricsIT.java

Lines changed: 72 additions & 85 deletions
Original file line number | Diff line number | Diff line change
@@ -107,14 +107,14 @@ protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) {
107107
static final String LOGSDB_INDEXING_TIME = "es.indices.logsdb.indexing.time";
108108
static final String LOGSDB_INDEXING_FAILURE = "es.indices.logsdb.indexing.failure.total";
109109

110-
public void testIndicesMetrics() throws Exception {
111-
String node = internalCluster().startNode();
110+
public void testIndicesMetrics() {
111+
String indexNode = internalCluster().startNode();
112112
ensureStableCluster(1);
113-
final TestTelemetryPlugin telemetry = internalCluster().getInstance(PluginsService.class, node)
113+
TestTelemetryPlugin telemetry = internalCluster().getInstance(PluginsService.class, indexNode)
114114
.filterPlugins(TestTelemetryPlugin.class)
115115
.findFirst()
116116
.orElseThrow();
117-
final IndicesService indicesService = internalCluster().getInstance(IndicesService.class, node);
117+
IndicesService indicesService = internalCluster().getInstance(IndicesService.class, indexNode);
118118
var indexing0 = indicesService.stats(CommonStatsFlags.ALL, false).getIndexing().getTotal();
119119
telemetry.resetMeter();
120120
long numStandardIndices = randomIntBetween(1, 5);
@@ -131,19 +131,12 @@ public void testIndicesMetrics() throws Exception {
131131
STANDARD_BYTES_SIZE,
132132
greaterThan(0L),
133133

134-
TIME_SERIES_INDEX_COUNT,
135-
equalTo(0L),
136-
TIME_SERIES_DOCS_COUNT,
137-
equalTo(0L),
138-
TIME_SERIES_BYTES_SIZE,
139-
equalTo(0L),
140-
141-
LOGSDB_INDEX_COUNT,
142-
equalTo(0L),
143-
LOGSDB_DOCS_COUNT,
144-
equalTo(0L),
145-
LOGSDB_BYTES_SIZE,
146-
equalTo(0L)
134+
STANDARD_INDEXING_COUNT,
135+
equalTo(numStandardDocs),
136+
STANDARD_INDEXING_TIME,
137+
greaterThanOrEqualTo(0L),
138+
STANDARD_INDEXING_FAILURE,
139+
equalTo(indexing1.getIndexFailedCount() - indexing0.getIndexCount())
147140
)
148141
);
149142

@@ -154,26 +147,19 @@ public void testIndicesMetrics() throws Exception {
154147
telemetry,
155148
2,
156149
Map.of(
157-
STANDARD_INDEX_COUNT,
158-
equalTo(numStandardIndices),
159-
STANDARD_DOCS_COUNT,
160-
equalTo(numStandardDocs),
161-
STANDARD_BYTES_SIZE,
162-
greaterThan(0L),
163-
164150
TIME_SERIES_INDEX_COUNT,
165151
equalTo(numTimeSeriesIndices),
166152
TIME_SERIES_DOCS_COUNT,
167153
equalTo(numTimeSeriesDocs),
168154
TIME_SERIES_BYTES_SIZE,
169155
greaterThan(20L),
170156

171-
LOGSDB_INDEX_COUNT,
172-
equalTo(0L),
173-
LOGSDB_DOCS_COUNT,
174-
equalTo(0L),
175-
LOGSDB_BYTES_SIZE,
176-
equalTo(0L)
157+
TIME_SERIES_INDEXING_COUNT,
158+
equalTo(numTimeSeriesDocs),
159+
TIME_SERIES_INDEXING_TIME,
160+
greaterThanOrEqualTo(0L),
161+
TIME_SERIES_INDEXING_FAILURE,
162+
equalTo(indexing2.getIndexFailedCount() - indexing1.getIndexFailedCount())
177163
)
178164
);
179165

@@ -184,72 +170,70 @@ public void testIndicesMetrics() throws Exception {
184170
telemetry,
185171
3,
186172
Map.of(
187-
STANDARD_INDEX_COUNT,
188-
equalTo(numStandardIndices),
189-
STANDARD_DOCS_COUNT,
190-
equalTo(numStandardDocs),
191-
STANDARD_BYTES_SIZE,
192-
greaterThan(0L),
193-
194-
TIME_SERIES_INDEX_COUNT,
195-
equalTo(numTimeSeriesIndices),
196-
TIME_SERIES_DOCS_COUNT,
197-
equalTo(numTimeSeriesDocs),
198-
TIME_SERIES_BYTES_SIZE,
199-
greaterThan(20L),
200-
201173
LOGSDB_INDEX_COUNT,
202174
equalTo(numLogsdbIndices),
203175
LOGSDB_DOCS_COUNT,
204176
equalTo(numLogsdbDocs),
205177
LOGSDB_BYTES_SIZE,
206-
greaterThan(0L)
178+
greaterThan(0L),
179+
LOGSDB_INDEXING_COUNT,
180+
equalTo(numLogsdbDocs),
181+
LOGSDB_INDEXING_TIME,
182+
greaterThanOrEqualTo(0L),
183+
LOGSDB_INDEXING_FAILURE,
184+
equalTo(indexing3.getIndexFailedCount() - indexing2.getIndexFailedCount())
207185
)
208186
);
209-
// indexing stats
187+
// already collected indexing stats
210188
collectThenAssertMetrics(
211189
telemetry,
212190
4,
213191
Map.of(
214192
STANDARD_INDEXING_COUNT,
215-
equalTo(numStandardDocs),
193+
equalTo(0L),
216194
STANDARD_INDEXING_TIME,
217-
greaterThanOrEqualTo(0L),
195+
equalTo(0L),
218196
STANDARD_INDEXING_FAILURE,
219-
equalTo(indexing1.getIndexFailedCount() - indexing0.getIndexCount()),
197+
equalTo(0L),
220198

221199
TIME_SERIES_INDEXING_COUNT,
222-
equalTo(numTimeSeriesDocs),
200+
equalTo(0L),
223201
TIME_SERIES_INDEXING_TIME,
224-
greaterThanOrEqualTo(0L),
202+
equalTo(0L),
225203
TIME_SERIES_INDEXING_FAILURE,
226-
equalTo(indexing2.getIndexFailedCount() - indexing1.getIndexFailedCount()),
204+
equalTo(0L),
227205

228206
LOGSDB_INDEXING_COUNT,
229-
equalTo(numLogsdbDocs),
207+
equalTo(0L),
230208
LOGSDB_INDEXING_TIME,
231-
greaterThanOrEqualTo(0L),
209+
equalTo(0L),
232210
LOGSDB_INDEXING_FAILURE,
233-
equalTo(indexing3.getIndexFailedCount() - indexing2.getIndexFailedCount())
211+
equalTo(0L)
234212
)
235213
);
236-
telemetry.resetMeter();
237-
214+
String searchNode = internalCluster().startDataOnlyNode();
215+
indicesService = internalCluster().getInstance(IndicesService.class, searchNode);
216+
telemetry = internalCluster().getInstance(PluginsService.class, searchNode)
217+
.filterPlugins(TestTelemetryPlugin.class)
218+
.findFirst()
219+
.orElseThrow();
220+
ensureGreen("st*", "log*", "time*");
238221
// search and fetch
239-
client().prepareSearch("standard*").setSize(100).get().decRef();
240-
var nodeStats1 = indicesService.stats(CommonStatsFlags.ALL, false).getSearch().getTotal();
222+
String preference = "_only_local";
223+
client(searchNode).prepareSearch("standard*").setPreference(preference).setSize(100).get().decRef();
224+
var search1 = indicesService.stats(CommonStatsFlags.ALL, false).getSearch().getTotal();
241225
collectThenAssertMetrics(
242226
telemetry,
243227
1,
244228
Map.of(
245229
STANDARD_QUERY_COUNT,
246230
equalTo(numStandardIndices),
247231
STANDARD_QUERY_TIME,
248-
equalTo(nodeStats1.getQueryTimeInMillis()),
232+
equalTo(search1.getQueryTimeInMillis()),
249233
STANDARD_FETCH_COUNT,
250-
equalTo(nodeStats1.getFetchCount()),
234+
equalTo(search1.getFetchCount()),
251235
STANDARD_FETCH_TIME,
252-
equalTo(nodeStats1.getFetchTimeInMillis()),
236+
equalTo(search1.getFetchTimeInMillis()),
253237

254238
TIME_SERIES_QUERY_COUNT,
255239
equalTo(0L),
@@ -263,67 +247,70 @@ public void testIndicesMetrics() throws Exception {
263247
)
264248
);
265249

266-
client().prepareSearch("time*").setSize(100).get().decRef();
267-
var nodeStats2 = indicesService.stats(CommonStatsFlags.ALL, false).getSearch().getTotal();
250+
client(searchNode).prepareSearch("time*").setPreference(preference).setSize(100).get().decRef();
251+
var search2 = indicesService.stats(CommonStatsFlags.ALL, false).getSearch().getTotal();
268252
collectThenAssertMetrics(
269253
telemetry,
270254
2,
271255
Map.of(
272256
STANDARD_QUERY_COUNT,
273-
equalTo(numStandardIndices),
257+
equalTo(0L),
274258
STANDARD_QUERY_TIME,
275-
equalTo(nodeStats1.getQueryTimeInMillis()),
259+
equalTo(0L),
276260

277261
TIME_SERIES_QUERY_COUNT,
278262
equalTo(numTimeSeriesIndices),
279263
TIME_SERIES_QUERY_TIME,
280-
equalTo(nodeStats2.getQueryTimeInMillis() - nodeStats1.getQueryTimeInMillis()),
264+
equalTo(search2.getQueryTimeInMillis() - search1.getQueryTimeInMillis()),
281265
TIME_SERIES_FETCH_COUNT,
282-
equalTo(nodeStats2.getFetchCount() - nodeStats1.getFetchCount()),
266+
equalTo(search2.getFetchCount() - search1.getFetchCount()),
283267
TIME_SERIES_FETCH_TIME,
284-
equalTo(nodeStats2.getFetchTimeInMillis() - nodeStats1.getFetchTimeInMillis()),
268+
equalTo(search2.getFetchTimeInMillis() - search1.getFetchTimeInMillis()),
285269

286270
LOGSDB_QUERY_COUNT,
287271
equalTo(0L),
288272
LOGSDB_QUERY_TIME,
289273
equalTo(0L)
290274
)
291275
);
292-
client().prepareSearch("logs*").setSize(100).get().decRef();
293-
var nodeStats3 = indicesService.stats(CommonStatsFlags.ALL, false).getSearch().getTotal();
276+
client(searchNode).prepareSearch("logs*").setPreference(preference).setSize(100).get().decRef();
277+
var search3 = indicesService.stats(CommonStatsFlags.ALL, false).getSearch().getTotal();
294278
collectThenAssertMetrics(
295279
telemetry,
296280
3,
297281
Map.of(
298282
STANDARD_QUERY_COUNT,
299-
equalTo(numStandardIndices),
283+
equalTo(0L),
300284
STANDARD_QUERY_TIME,
301-
equalTo(nodeStats1.getQueryTimeInMillis()),
285+
equalTo(0L),
302286

303287
TIME_SERIES_QUERY_COUNT,
304-
equalTo(numTimeSeriesIndices),
288+
equalTo(0L),
305289
TIME_SERIES_QUERY_TIME,
306-
equalTo(nodeStats2.getQueryTimeInMillis() - nodeStats1.getQueryTimeInMillis()),
290+
equalTo(0L),
307291

308292
LOGSDB_QUERY_COUNT,
309293
equalTo(numLogsdbIndices),
310294
LOGSDB_QUERY_TIME,
311-
equalTo(nodeStats3.getQueryTimeInMillis() - nodeStats2.getQueryTimeInMillis()),
295+
equalTo(search3.getQueryTimeInMillis() - search2.getQueryTimeInMillis()),
312296
LOGSDB_FETCH_COUNT,
313-
equalTo(nodeStats3.getFetchCount() - nodeStats2.getFetchCount()),
297+
equalTo(search3.getFetchCount() - search2.getFetchCount()),
314298
LOGSDB_FETCH_TIME,
315-
equalTo(nodeStats3.getFetchTimeInMillis() - nodeStats2.getFetchTimeInMillis())
299+
equalTo(search3.getFetchTimeInMillis() - search2.getFetchTimeInMillis())
316300
)
317301
);
318302
// search failures
319-
expectThrows(Exception.class, () -> { client().prepareSearch("logs*").setRuntimeMappings(parseMapping("""
320-
{
321-
"fail_me": {
322-
"type": "long",
323-
"script": {"source": "<>", "lang": "failing_field"}
303+
expectThrows(
304+
Exception.class,
305+
() -> { client(searchNode).prepareSearch("logs*").setPreference(preference).setRuntimeMappings(parseMapping("""
306+
{
307+
"fail_me": {
308+
"type": "long",
309+
"script": {"source": "<>", "lang": "failing_field"}
310+
}
324311
}
325-
}
326-
""")).setQuery(new RangeQueryBuilder("fail_me").gte(0)).setAllowPartialSearchResults(true).get(); });
312+
""")).setQuery(new RangeQueryBuilder("fail_me").gte(0)).setAllowPartialSearchResults(true).get(); }
313+
);
327314
collectThenAssertMetrics(
328315
telemetry,
329316
4,

0 commit comments

Comments
 (0)