Skip to content

Commit 67e4ec9

Browse files
authored
[7.17] [ML] ML stats failures should not stop the usage API working (#91936)
It is possible to meddle with internal ML state such that calls to the ML stats APIs return errors. It is justifiable for these single-purpose APIs to return errors when the internal state of ML is corrupted. However, it is undesirable for these low-level problems to completely prevent the overall usage API from returning, because then callers cannot find out usage information from any part of the system. This change makes errors in the ML stats APIs non-fatal to the overall response of the usage API. When an ML stats API returns an error, the corresponding section of the ML usage information will be blank. Backport of #91917
1 parent 965b9b6 commit 67e4ec9

File tree

4 files changed

+119
-24
lines changed

4 files changed

+119
-24
lines changed

docs/changelog/91917.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 91917
2+
summary: ML stats failures should not stop the usage API working
3+
area: Machine Learning
4+
type: bug
5+
issues:
6+
- 91893

x-pack/plugin/ml/qa/ml-with-security/build.gradle

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,7 @@ tasks.named("yamlRestTest").configure {
187187
'ml/jobs_get_result_overall_buckets/Test overall buckets given invalid start param',
188188
'ml/jobs_get_result_overall_buckets/Test overall buckets given invalid end param',
189189
'ml/jobs_get_result_overall_buckets/Test overall buckets given bucket_span is smaller than max job bucket_span',
190+
'ml/jobs_get_stats/Test closed results index',
190191
'ml/jobs_get_stats/Test get job stats given missing job',
191192
'ml/jobs_get_stats/Test no exception on get job stats with missing index',
192193
'ml/job_groups/Test put job with empty group',

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearningFeatureSet.java

Lines changed: 57 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
package org.elasticsearch.xpack.ml;
88

99
import org.apache.logging.log4j.LogManager;
10+
import org.apache.logging.log4j.Logger;
1011
import org.apache.lucene.util.Constants;
1112
import org.apache.lucene.util.Counter;
1213
import org.elasticsearch.ElasticsearchException;
@@ -73,6 +74,8 @@
7374

7475
public class MachineLearningFeatureSet implements XPackFeatureSet {
7576

77+
private static final Logger logger = LogManager.getLogger(MachineLearningFeatureSet.class);
78+
7679
/**
7780
* List of platforms for which the native processes are available
7881
*/
@@ -368,58 +371,88 @@ public void execute(ActionListener<Usage> listener) {
368371
nodeCount
369372
);
370373
listener.onResponse(usage);
371-
}, listener::onFailure);
374+
}, e -> {
375+
logger.warn("Failed to get trained models usage to include in ML usage", e);
376+
MachineLearningFeatureSetUsage usage = new MachineLearningFeatureSetUsage(
377+
available,
378+
enabled,
379+
jobsUsage,
380+
datafeedsUsage,
381+
analyticsUsage,
382+
inferenceUsage,
383+
nodeCount
384+
);
385+
listener.onResponse(usage);
386+
});
372387

373388
// Step 5. Extract usage from ingest statistics and gather trained model config count
389+
GetTrainedModelsAction.Request getModelsRequest = new GetTrainedModelsAction.Request(
390+
"*",
391+
Collections.emptyList(),
392+
Collections.emptySet()
393+
);
394+
getModelsRequest.setPageParams(new PageParams(0, 10_000));
374395
ActionListener<NodesStatsResponse> nodesStatsListener = ActionListener.wrap(response -> {
375396
addInferenceIngestUsage(response, inferenceUsage);
376-
GetTrainedModelsAction.Request getModelsRequest = new GetTrainedModelsAction.Request(
377-
"*",
378-
Collections.emptyList(),
379-
Collections.emptySet()
380-
);
381-
getModelsRequest.setPageParams(new PageParams(0, 10_000));
382397
client.execute(GetTrainedModelsAction.INSTANCE, getModelsRequest, trainedModelsListener);
383-
}, listener::onFailure);
398+
}, e -> {
399+
logger.warn("Failed to get inference ingest usage to include in ML usage", e);
400+
client.execute(GetTrainedModelsAction.INSTANCE, getModelsRequest, trainedModelsListener);
401+
});
384402

385403
// Step 4. Extract usage from data frame analytics configs and then request ingest node stats
404+
String[] ingestNodes = ingestNodes(state);
405+
NodesStatsRequest nodesStatsRequest = new NodesStatsRequest(ingestNodes).clear()
406+
.addMetric(NodesStatsRequest.Metric.INGEST.metricName());
386407
ActionListener<GetDataFrameAnalyticsAction.Response> dataframeAnalyticsListener = ActionListener.wrap(response -> {
387408
addDataFrameAnalyticsUsage(response, analyticsUsage);
388-
String[] ingestNodes = ingestNodes(state);
389-
NodesStatsRequest nodesStatsRequest = new NodesStatsRequest(ingestNodes).clear()
390-
.addMetric(NodesStatsRequest.Metric.INGEST.metricName());
391409
client.execute(NodesStatsAction.INSTANCE, nodesStatsRequest, nodesStatsListener);
392-
}, listener::onFailure);
410+
}, e -> {
411+
logger.warn("Failed to get data frame analytics configs to include in ML usage", e);
412+
client.execute(NodesStatsAction.INSTANCE, nodesStatsRequest, nodesStatsListener);
413+
});
393414

394415
// Step 3. Extract usage from data frame analytics stats and then request data frame analytics configs
416+
GetDataFrameAnalyticsAction.Request getDfaRequest = new GetDataFrameAnalyticsAction.Request(Metadata.ALL);
417+
getDfaRequest.setPageParams(new PageParams(0, 10_000));
395418
ActionListener<GetDataFrameAnalyticsStatsAction.Response> dataframeAnalyticsStatsListener = ActionListener.wrap(response -> {
396419
addDataFrameAnalyticsStatsUsage(response, analyticsUsage);
397-
GetDataFrameAnalyticsAction.Request getDfaRequest = new GetDataFrameAnalyticsAction.Request(Metadata.ALL);
398-
getDfaRequest.setPageParams(new PageParams(0, 10_000));
399420
client.execute(GetDataFrameAnalyticsAction.INSTANCE, getDfaRequest, dataframeAnalyticsListener);
400-
}, listener::onFailure);
421+
}, e -> {
422+
logger.warn("Failed to get data frame analytics stats to include in ML usage", e);
423+
client.execute(GetDataFrameAnalyticsAction.INSTANCE, getDfaRequest, dataframeAnalyticsListener);
424+
});
401425

402426
// Step 2. Extract usage from datafeeds stats and then request data frame analytics stats
427+
GetDataFrameAnalyticsStatsAction.Request dataframeAnalyticsStatsRequest = new GetDataFrameAnalyticsStatsAction.Request(
428+
GetDatafeedsStatsAction.ALL
429+
);
430+
dataframeAnalyticsStatsRequest.setPageParams(new PageParams(0, 10_000));
403431
ActionListener<GetDatafeedsStatsAction.Response> datafeedStatsListener = ActionListener.wrap(response -> {
404432
addDatafeedsUsage(response);
405-
GetDataFrameAnalyticsStatsAction.Request dataframeAnalyticsStatsRequest = new GetDataFrameAnalyticsStatsAction.Request(
406-
GetDatafeedsStatsAction.ALL
407-
);
408-
dataframeAnalyticsStatsRequest.setPageParams(new PageParams(0, 10_000));
409433
client.execute(GetDataFrameAnalyticsStatsAction.INSTANCE, dataframeAnalyticsStatsRequest, dataframeAnalyticsStatsListener);
410-
}, listener::onFailure);
434+
}, e -> {
435+
logger.warn("Failed to get datafeed stats to include in ML usage", e);
436+
client.execute(GetDataFrameAnalyticsStatsAction.INSTANCE, dataframeAnalyticsStatsRequest, dataframeAnalyticsStatsListener);
437+
});
411438

412439
// Step 1. Extract usage from jobs stats and then request stats for all datafeeds
413-
GetJobsStatsAction.Request jobStatsRequest = new GetJobsStatsAction.Request(Metadata.ALL);
440+
GetDatafeedsStatsAction.Request datafeedStatsRequest = new GetDatafeedsStatsAction.Request(GetDatafeedsStatsAction.ALL);
414441
ActionListener<GetJobsStatsAction.Response> jobStatsListener = ActionListener.wrap(response -> {
415442
jobManagerHolder.getJobManager().expandJobs(Metadata.ALL, true, ActionListener.wrap(jobs -> {
416443
addJobsUsage(response, jobs.results());
417-
GetDatafeedsStatsAction.Request datafeedStatsRequest = new GetDatafeedsStatsAction.Request(GetDatafeedsStatsAction.ALL);
418444
client.execute(GetDatafeedsStatsAction.INSTANCE, datafeedStatsRequest, datafeedStatsListener);
419-
}, listener::onFailure));
420-
}, listener::onFailure);
445+
}, e -> {
446+
logger.warn("Failed to get job configs to include in ML usage", e);
447+
client.execute(GetDatafeedsStatsAction.INSTANCE, datafeedStatsRequest, datafeedStatsListener);
448+
}));
449+
}, e -> {
450+
logger.warn("Failed to get job stats to include in ML usage", e);
451+
client.execute(GetDatafeedsStatsAction.INSTANCE, datafeedStatsRequest, datafeedStatsListener);
452+
});
421453

422454
// Step 0. Kick off the chain of callbacks by requesting jobs stats
455+
GetJobsStatsAction.Request jobStatsRequest = new GetJobsStatsAction.Request(Metadata.ALL);
423456
client.execute(GetJobsStatsAction.INSTANCE, jobStatsRequest, jobStatsListener);
424457
}
425458

x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/jobs_get_stats.yml

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -413,3 +413,58 @@ setup:
413413
- is_false: jobs.1.timing_stats.maximum_bucket_processing_time_ms
414414
- is_false: jobs.1.timing_stats.average_bucket_processing_time_ms
415415
- is_false: jobs.1.timing_stats.exponential_average_bucket_processing_time_ms
416+
417+
---
418+
"Test closed results index":
419+
420+
- skip:
421+
features:
422+
- "warnings"
423+
424+
- do:
425+
warnings:
426+
- 'Posting data directly to anomaly detection jobs is deprecated, in a future major version it will be compulsory to use a datafeed'
427+
ml.post_data:
428+
job_id: job-stats-test
429+
body: >
430+
{"airline":"AAL","responsetime":"132.2046","time":"1403481600"}
431+
{"airline":"JZA","responsetime":"990.4628","time":"1403481600"}
432+
433+
- do:
434+
ml.close_job:
435+
job_id: jobs-get-stats-datafeed-job
436+
- match: { closed: true }
437+
438+
- do:
439+
ml.close_job:
440+
job_id: job-stats-test
441+
- match: { closed: true }
442+
443+
- do:
444+
ml.get_job_stats: {}
445+
- length: { jobs : 2 }
446+
447+
- do:
448+
xpack.usage: {}
449+
- match: { ml.available: true }
450+
- match: { ml.enabled: true }
451+
- match: { ml.jobs.closed.count: 2 }
452+
453+
- do:
454+
indices.close:
455+
index: .ml-anomalies-shared
456+
wait_for_active_shards: index-setting
457+
458+
# With the index closed the low level ML API reports a problem
459+
- do:
460+
catch: /type=cluster_block_exception, reason=index \[.ml-anomalies-shared\] blocked by. \[FORBIDDEN\/.\/index closed\]/
461+
ml.get_job_stats: {}
462+
463+
# But the high level X-Pack API returns what it can - we do this
464+
# so that corruption to ML doesn't blind observers of the general
465+
# cluster status
466+
- do:
467+
xpack.usage: {}
468+
- match: { ml.available: true }
469+
- match: { ml.enabled: true }
470+
- is_false: ml.jobs.closed.count

0 commit comments

Comments
 (0)