Skip to content

Commit 28fcdb9

Browse files
authored
[8.5] [ML] ML stats failures should not stop the usage API working (#91933)
It is possible to meddle with internal ML state such that calls to the ML stats APIs return errors. It is justifiable for these single-purpose APIs to return errors when the internal state of ML is corrupted. However, it is undesirable for these low-level problems to completely prevent the overall usage API from returning, because then callers cannot find out usage information from any part of the system. This change makes errors in the ML stats APIs non-fatal to the overall response of the usage API. When an ML stats API returns an error, the corresponding section of the ML usage information will be blank. Backport of #91917
1 parent 379ac6d commit 28fcdb9

File tree

4 files changed

+113
-17
lines changed

4 files changed

+113
-17
lines changed

docs/changelog/91917.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 91917
2+
summary: ML stats failures should not stop the usage API working
3+
area: Machine Learning
4+
type: bug
5+
issues:
6+
- 91893

x-pack/plugin/ml/qa/ml-with-security/build.gradle

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,7 @@ tasks.named("yamlRestTest").configure {
195195
'ml/jobs_get_result_overall_buckets/Test overall buckets given invalid start param',
196196
'ml/jobs_get_result_overall_buckets/Test overall buckets given invalid end param',
197197
'ml/jobs_get_result_overall_buckets/Test overall buckets given bucket_span is smaller than max job bucket_span',
198+
'ml/jobs_get_stats/Test closed results index',
198199
'ml/jobs_get_stats/Test get job stats given missing job',
199200
'ml/jobs_get_stats/Test no exception on get job stats with missing index',
200201
'ml/job_groups/Test put job with empty group',

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearningUsageTransportAction.java

Lines changed: 52 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
import org.elasticsearch.common.util.Maps;
2121
import org.elasticsearch.env.Environment;
2222
import org.elasticsearch.license.XPackLicenseState;
23+
import org.elasticsearch.logging.LogManager;
24+
import org.elasticsearch.logging.Logger;
2325
import org.elasticsearch.protocol.xpack.XPackUsageRequest;
2426
import org.elasticsearch.tasks.Task;
2527
import org.elasticsearch.threadpool.ThreadPool;
@@ -66,6 +68,8 @@
6668

6769
public class MachineLearningUsageTransportAction extends XPackUsageFeatureTransportAction {
6870

71+
private static final Logger logger = LogManager.getLogger(MachineLearningUsageTransportAction.class);
72+
6973
private final Client client;
7074
private final XPackLicenseState licenseState;
7175
private final JobManagerHolder jobManagerHolder;
@@ -124,8 +128,8 @@ protected void masterOperation(
124128
int nodeCount = mlNodeCount(state);
125129

126130
// Step 5. return final ML usage
127-
ActionListener<Map<String, Object>> inferenceUsageListener = ActionListener.wrap(inferenceUsage -> {
128-
listener.onResponse(
131+
ActionListener<Map<String, Object>> inferenceUsageListener = ActionListener.wrap(
132+
inferenceUsage -> listener.onResponse(
129133
new XPackUsageFeatureResponse(
130134
new MachineLearningFeatureSetUsage(
131135
MachineLearningField.ML_API_FEATURE.checkWithoutTracking(licenseState),
@@ -137,45 +141,76 @@ protected void masterOperation(
137141
nodeCount
138142
)
139143
)
140-
);
141-
}, listener::onFailure);
144+
),
145+
e -> {
146+
logger.warn("Failed to get inference usage to include in ML usage", e);
147+
listener.onResponse(
148+
new XPackUsageFeatureResponse(
149+
new MachineLearningFeatureSetUsage(
150+
MachineLearningField.ML_API_FEATURE.checkWithoutTracking(licenseState),
151+
enabled,
152+
jobsUsage,
153+
datafeedsUsage,
154+
analyticsUsage,
155+
Collections.emptyMap(),
156+
nodeCount
157+
)
158+
)
159+
);
160+
}
161+
);
142162

143163
// Step 4. Extract usage from data frame analytics configs and then get inference usage
144164
ActionListener<GetDataFrameAnalyticsAction.Response> dataframeAnalyticsListener = ActionListener.wrap(response -> {
145165
addDataFrameAnalyticsUsage(response, analyticsUsage);
146166
addInferenceUsage(inferenceUsageListener);
147-
}, listener::onFailure);
167+
}, e -> {
168+
logger.warn("Failed to get data frame analytics configs to include in ML usage", e);
169+
addInferenceUsage(inferenceUsageListener);
170+
});
148171

149172
// Step 3. Extract usage from data frame analytics stats and then request data frame analytics configs
173+
GetDataFrameAnalyticsAction.Request getDfaRequest = new GetDataFrameAnalyticsAction.Request(Metadata.ALL);
174+
getDfaRequest.setPageParams(new PageParams(0, 10_000));
150175
ActionListener<GetDataFrameAnalyticsStatsAction.Response> dataframeAnalyticsStatsListener = ActionListener.wrap(response -> {
151176
addDataFrameAnalyticsStatsUsage(response, analyticsUsage);
152-
GetDataFrameAnalyticsAction.Request getDfaRequest = new GetDataFrameAnalyticsAction.Request(Metadata.ALL);
153-
getDfaRequest.setPageParams(new PageParams(0, 10_000));
154177
client.execute(GetDataFrameAnalyticsAction.INSTANCE, getDfaRequest, dataframeAnalyticsListener);
155-
}, listener::onFailure);
178+
}, e -> {
179+
logger.warn("Failed to get data frame analytics stats to include in ML usage", e);
180+
client.execute(GetDataFrameAnalyticsAction.INSTANCE, getDfaRequest, dataframeAnalyticsListener);
181+
});
156182

157183
// Step 2. Extract usage from datafeeds stats and then request stats for data frame analytics
184+
GetDataFrameAnalyticsStatsAction.Request dataframeAnalyticsStatsRequest = new GetDataFrameAnalyticsStatsAction.Request(
185+
Metadata.ALL
186+
);
187+
dataframeAnalyticsStatsRequest.setPageParams(new PageParams(0, 10_000));
158188
ActionListener<GetDatafeedsStatsAction.Response> datafeedStatsListener = ActionListener.wrap(response -> {
159189
addDatafeedsUsage(response, datafeedsUsage);
160-
GetDataFrameAnalyticsStatsAction.Request dataframeAnalyticsStatsRequest = new GetDataFrameAnalyticsStatsAction.Request(
161-
Metadata.ALL
162-
);
163-
dataframeAnalyticsStatsRequest.setPageParams(new PageParams(0, 10_000));
164190
client.execute(GetDataFrameAnalyticsStatsAction.INSTANCE, dataframeAnalyticsStatsRequest, dataframeAnalyticsStatsListener);
165-
}, listener::onFailure);
191+
}, e -> {
192+
logger.warn("Failed to get datafeed stats to include in ML usage", e);
193+
client.execute(GetDataFrameAnalyticsStatsAction.INSTANCE, dataframeAnalyticsStatsRequest, dataframeAnalyticsStatsListener);
194+
});
166195

167196
// Step 1. Extract usage from jobs stats and then request stats for all datafeeds
168-
GetJobsStatsAction.Request jobStatsRequest = new GetJobsStatsAction.Request(Metadata.ALL);
197+
GetDatafeedsStatsAction.Request datafeedStatsRequest = new GetDatafeedsStatsAction.Request(Metadata.ALL);
169198
ActionListener<GetJobsStatsAction.Response> jobStatsListener = ActionListener.wrap(
170199
response -> jobManagerHolder.getJobManager().expandJobs(Metadata.ALL, true, ActionListener.wrap(jobs -> {
171200
addJobsUsage(response, jobs.results(), jobsUsage);
172-
GetDatafeedsStatsAction.Request datafeedStatsRequest = new GetDatafeedsStatsAction.Request(Metadata.ALL);
173201
client.execute(GetDatafeedsStatsAction.INSTANCE, datafeedStatsRequest, datafeedStatsListener);
174-
}, listener::onFailure)),
175-
listener::onFailure
202+
}, e -> {
203+
logger.warn("Failed to get job configs to include in ML usage", e);
204+
client.execute(GetDatafeedsStatsAction.INSTANCE, datafeedStatsRequest, datafeedStatsListener);
205+
})),
206+
e -> {
207+
logger.warn("Failed to get job stats to include in ML usage", e);
208+
client.execute(GetDatafeedsStatsAction.INSTANCE, datafeedStatsRequest, datafeedStatsListener);
209+
}
176210
);
177211

178212
// Step 0. Kick off the chain of callbacks by requesting jobs stats
213+
GetJobsStatsAction.Request jobStatsRequest = new GetJobsStatsAction.Request(Metadata.ALL);
179214
client.execute(GetJobsStatsAction.INSTANCE, jobStatsRequest, jobStatsListener);
180215
}
181216

x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/jobs_get_stats.yml

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -397,3 +397,57 @@ setup:
397397
- is_false: jobs.1.timing_stats.maximum_bucket_processing_time_ms
398398
- is_false: jobs.1.timing_stats.average_bucket_processing_time_ms
399399
- is_false: jobs.1.timing_stats.exponential_average_bucket_processing_time_ms
400+
401+
---
402+
"Test closed results index":
403+
404+
- skip:
405+
features:
406+
- "warnings"
407+
408+
- do:
409+
warnings:
410+
- 'Posting data directly to anomaly detection jobs is deprecated, in a future major version it will be compulsory to use a datafeed'
411+
ml.post_data:
412+
job_id: job-stats-test
413+
body: >
414+
{"airline":"AAL","responsetime":"132.2046","time":"1403481600"}
415+
{"airline":"JZA","responsetime":"990.4628","time":"1403481600"}
416+
417+
- do:
418+
ml.close_job:
419+
job_id: jobs-get-stats-datafeed-job
420+
- match: { closed: true }
421+
422+
- do:
423+
ml.close_job:
424+
job_id: job-stats-test
425+
- match: { closed: true }
426+
427+
- do:
428+
ml.get_job_stats: {}
429+
- length: { jobs : 2 }
430+
431+
- do:
432+
xpack.usage: {}
433+
- match: { ml.available: true }
434+
- match: { ml.enabled: true }
435+
- match: { ml.jobs.closed.count: 2 }
436+
437+
- do:
438+
indices.close:
439+
index: .ml-anomalies-shared
440+
441+
# With the index closed the low level ML API reports a problem
442+
- do:
443+
catch: /type=cluster_block_exception, reason=index \[.ml-anomalies-shared\] blocked by. \[FORBIDDEN\/.\/index closed\]/
444+
ml.get_job_stats: {}
445+
446+
# But the high level X-Pack API returns what it can - we do this
447+
# so that corruption to ML doesn't blind observers of the general
448+
# cluster status
449+
- do:
450+
xpack.usage: {}
451+
- match: { ml.available: true }
452+
- match: { ml.enabled: true }
453+
- is_false: ml.jobs.closed.count

0 commit comments

Comments
 (0)