Skip to content

Commit 37ecb22

Browse files
committed
Fix ML tests failing with "no shards available" (elastic#136800)
1 parent 46586ac commit 37ecb22

File tree

7 files changed

+48
-69
lines changed

7 files changed

+48
-69
lines changed

muted-tests.yml

Lines changed: 0 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -164,18 +164,12 @@ tests:
164164
- class: org.elasticsearch.xpack.test.rest.XPackRestIT
165165
method: test {p0=snapshot/10_basic/Create a source only snapshot and then restore it}
166166
issue: https://github.com/elastic/elasticsearch/issues/122755
167-
- class: org.elasticsearch.smoketest.MlWithSecurityIT
168-
method: test {yaml=ml/data_frame_analytics_crud/Test get stats given multiple analytics}
169-
issue: https://github.com/elastic/elasticsearch/issues/123034
170167
- class: org.elasticsearch.indices.recovery.IndexRecoveryIT
171168
method: testSourceThrottling
172169
issue: https://github.com/elastic/elasticsearch/issues/123680
173170
- class: org.elasticsearch.smoketest.MlWithSecurityIT
174171
method: test {yaml=ml/3rd_party_deployment/Test start deployment fails while model download in progress}
175172
issue: https://github.com/elastic/elasticsearch/issues/120814
176-
- class: org.elasticsearch.smoketest.MlWithSecurityIT
177-
method: test {yaml=ml/start_data_frame_analytics/Test start classification analysis when the dependent variable is missing}
178-
issue: https://github.com/elastic/elasticsearch/issues/124168
179173
- class: org.elasticsearch.smoketest.MlWithSecurityIT
180174
method: test {yaml=ml/3rd_party_deployment/Test start and stop multiple deployments}
181175
issue: https://github.com/elastic/elasticsearch/issues/124315
@@ -200,15 +194,6 @@ tests:
200194
- class: org.elasticsearch.index.shard.StoreRecoveryTests
201195
method: testAddIndices
202196
issue: https://github.com/elastic/elasticsearch/issues/124104
203-
- class: org.elasticsearch.smoketest.MlWithSecurityIT
204-
method: test {yaml=ml/data_frame_analytics_crud/Test get stats on newly created config}
205-
issue: https://github.com/elastic/elasticsearch/issues/121726
206-
- class: org.elasticsearch.smoketest.MlWithSecurityIT
207-
method: test {yaml=ml/data_frame_analytics_cat_apis/Test cat data frame analytics all jobs with header and column selection}
208-
issue: https://github.com/elastic/elasticsearch/issues/125641
209-
- class: org.elasticsearch.smoketest.MlWithSecurityIT
210-
method: test {yaml=ml/data_frame_analytics_cat_apis/Test cat data frame analytics single job with header}
211-
issue: https://github.com/elastic/elasticsearch/issues/125642
212197
- class: org.elasticsearch.xpack.test.rest.XPackRestIT
213198
method: test {p0=transform/transforms_start_stop/Test schedule_now on an already started transform}
214199
issue: https://github.com/elastic/elasticsearch/issues/120720
@@ -218,9 +203,6 @@ tests:
218203
- class: org.elasticsearch.xpack.core.common.notifications.AbstractAuditorTests
219204
method: testRecreateTemplateWhenDeleted
220205
issue: https://github.com/elastic/elasticsearch/issues/123232
221-
- class: org.elasticsearch.xpack.test.rest.XPackRestIT
222-
method: test {p0=ml/start_data_frame_analytics/Test start given dest index is not empty}
223-
issue: https://github.com/elastic/elasticsearch/issues/125909
224206
- class: org.elasticsearch.xpack.test.rest.XPackRestIT
225207
method: test {p0=transform/transforms_stats/Test get transform stats with timeout}
226208
issue: https://github.com/elastic/elasticsearch/issues/125975
@@ -236,15 +218,6 @@ tests:
236218
- class: org.elasticsearch.xpack.test.rest.XPackRestIT
237219
method: test {p0=transform/transforms_stats/Test get transform stats}
238220
issue: https://github.com/elastic/elasticsearch/issues/126270
239-
- class: org.elasticsearch.xpack.test.rest.XPackRestIT
240-
method: test {p0=ml/start_data_frame_analytics/Test start classification analysis when the dependent variable cardinality is too low}
241-
issue: https://github.com/elastic/elasticsearch/issues/126299
242-
- class: org.elasticsearch.smoketest.MlWithSecurityIT
243-
method: test {yaml=ml/start_data_frame_analytics/Test start classification analysis when the dependent variable cardinality is too low}
244-
issue: https://github.com/elastic/elasticsearch/issues/123200
245-
- class: org.elasticsearch.smoketest.MlWithSecurityIT
246-
method: test {yaml=ml/trained_model_cat_apis/Test cat trained models}
247-
issue: https://github.com/elastic/elasticsearch/issues/125750
248221
- class: org.elasticsearch.xpack.test.rest.XPackRestIT
249222
method: test {p0=transform/transforms_start_stop/Test start/stop only starts/stops specified transform}
250223
issue: https://github.com/elastic/elasticsearch/issues/126466
@@ -284,9 +257,6 @@ tests:
284257
- class: org.elasticsearch.xpack.remotecluster.CrossClusterEsqlRCS2EnrichUnavailableRemotesIT
285258
method: testEsqlEnrichWithSkipUnavailable
286259
issue: https://github.com/elastic/elasticsearch/issues/127368
287-
- class: org.elasticsearch.xpack.test.rest.XPackRestIT
288-
method: test {p0=ml/data_frame_analytics_cat_apis/Test cat data frame analytics all jobs with header}
289-
issue: https://github.com/elastic/elasticsearch/issues/127625
290260
- class: org.elasticsearch.xpack.ccr.action.ShardFollowTaskReplicationTests
291261
method: testChangeFollowerHistoryUUID
292262
issue: https://github.com/elastic/elasticsearch/issues/127680
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,12 @@
2222
import java.util.List;
2323
import java.util.Objects;
2424

25-
public class ResetAuditorAction extends ActionType<ResetAuditorAction.Response> {
25+
public class ResetMlComponentsAction extends ActionType<ResetMlComponentsAction.Response> {
2626

27-
public static final ResetAuditorAction INSTANCE = new ResetAuditorAction();
27+
public static final ResetMlComponentsAction INSTANCE = new ResetMlComponentsAction();
2828
public static final String NAME = "cluster:internal/xpack/ml/auditor/reset";
2929

30-
private ResetAuditorAction() {
30+
private ResetMlComponentsAction() {
3131
super(NAME);
3232
}
3333

x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/DeleteExpiredDataIT.java

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,6 @@ public void testDeleteExpiredData_GivenNothingToDelete() throws Exception {
105105
client().execute(DeleteExpiredDataAction.INSTANCE, new DeleteExpiredDataAction.Request()).get();
106106
}
107107

108-
@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/62699")
109108
public void testDeleteExpiredDataNoThrottle() throws Exception {
110109
testExpiredDeletion(null, 10010);
111110
}
@@ -152,7 +151,6 @@ public void testDeleteExpiredDataActionDeletesEmptyStateIndices() throws Excepti
152151
);
153152
}
154153

155-
@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/62699")
156154
public void testDeleteExpiredDataWithStandardThrottle() throws Exception {
157155
testExpiredDeletion(-1.0f, 100);
158156
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -159,8 +159,8 @@
159159
import org.elasticsearch.xpack.core.ml.action.PutTrainedModelAliasAction;
160160
import org.elasticsearch.xpack.core.ml.action.PutTrainedModelDefinitionPartAction;
161161
import org.elasticsearch.xpack.core.ml.action.PutTrainedModelVocabularyAction;
162-
import org.elasticsearch.xpack.core.ml.action.ResetAuditorAction;
163162
import org.elasticsearch.xpack.core.ml.action.ResetJobAction;
163+
import org.elasticsearch.xpack.core.ml.action.ResetMlComponentsAction;
164164
import org.elasticsearch.xpack.core.ml.action.RevertModelSnapshotAction;
165165
import org.elasticsearch.xpack.core.ml.action.SetResetModeAction;
166166
import org.elasticsearch.xpack.core.ml.action.SetUpgradeModeAction;
@@ -270,8 +270,8 @@
270270
import org.elasticsearch.xpack.ml.action.TransportPutTrainedModelAliasAction;
271271
import org.elasticsearch.xpack.ml.action.TransportPutTrainedModelDefinitionPartAction;
272272
import org.elasticsearch.xpack.ml.action.TransportPutTrainedModelVocabularyAction;
273-
import org.elasticsearch.xpack.ml.action.TransportResetAuditorAction;
274273
import org.elasticsearch.xpack.ml.action.TransportResetJobAction;
274+
import org.elasticsearch.xpack.ml.action.TransportResetMlComponentsAction;
275275
import org.elasticsearch.xpack.ml.action.TransportRevertModelSnapshotAction;
276276
import org.elasticsearch.xpack.ml.action.TransportSetResetModeAction;
277277
import org.elasticsearch.xpack.ml.action.TransportSetUpgradeModeAction;
@@ -804,7 +804,7 @@ public void loadExtensions(ExtensionLoader loader) {
804804
private final SetOnce<LearningToRankService> learningToRankService = new SetOnce<>();
805805
private final SetOnce<MlAutoscalingDeciderService> mlAutoscalingDeciderService = new SetOnce<>();
806806
private final SetOnce<DeploymentManager> deploymentManager = new SetOnce<>();
807-
private final SetOnce<TrainedModelAssignmentClusterService> trainedModelAllocationClusterServiceSetOnce = new SetOnce<>();
807+
private final SetOnce<TrainedModelAssignmentClusterService> trainedModelAllocationClusterService = new SetOnce<>();
808808

809809
private final SetOnce<MachineLearningExtension> machineLearningExtension = new SetOnce<>();
810810

@@ -1314,7 +1314,7 @@ public Collection<?> createComponents(PluginServices services) {
13141314
clusterService,
13151315
threadPool
13161316
);
1317-
trainedModelAllocationClusterServiceSetOnce.set(
1317+
trainedModelAllocationClusterService.set(
13181318
new TrainedModelAssignmentClusterService(
13191319
settings,
13201320
clusterService,
@@ -1390,7 +1390,8 @@ public Collection<?> createComponents(PluginServices services) {
13901390
trainedModelCacheMetadataService,
13911391
trainedModelProvider,
13921392
trainedModelAssignmentService,
1393-
trainedModelAllocationClusterServiceSetOnce.get(),
1393+
trainedModelAllocationClusterService.get(),
1394+
trainedModelStatsService,
13941395
deploymentManager.get(),
13951396
nodeAvailabilityZoneMapper,
13961397
new MachineLearningExtensionHolder(machineLearningExtension.get()),
@@ -1563,7 +1564,7 @@ public List<ActionHandler> getActions() {
15631564
actionHandlers.add(new ActionHandler(MlMemoryAction.INSTANCE, TransportMlMemoryAction.class));
15641565
actionHandlers.add(new ActionHandler(SetUpgradeModeAction.INSTANCE, TransportSetUpgradeModeAction.class));
15651566
actionHandlers.add(new ActionHandler(SetResetModeAction.INSTANCE, TransportSetResetModeAction.class));
1566-
actionHandlers.add(new ActionHandler(ResetAuditorAction.INSTANCE, TransportResetAuditorAction.class));
1567+
actionHandlers.add(new ActionHandler(ResetMlComponentsAction.INSTANCE, TransportResetMlComponentsAction.class));
15671568
// Included in this section as it's used by MlMemoryAction
15681569
actionHandlers.add(new ActionHandler(TrainedModelCacheInfoAction.INSTANCE, TransportTrainedModelCacheInfoAction.class));
15691570
actionHandlers.add(new ActionHandler(GetMlAutoscalingStats.INSTANCE, TransportGetMlAutoscalingStats.class));
@@ -2180,17 +2181,17 @@ public void cleanUpFeature(
21802181
});
21812182

21822183
ActionListener<ResetFeatureStateResponse.ResetFeatureStateStatus> resetAuditors = ActionListener.wrap(success -> {
2183-
// reset the auditors as aliases used may be removed
2184+
// reset components, such as the auditors and the trained model stats queue
21842185
client.execute(
2185-
ResetAuditorAction.INSTANCE,
2186-
ResetAuditorAction.Request.RESET_AUDITOR_REQUEST,
2186+
ResetMlComponentsAction.INSTANCE,
2187+
ResetMlComponentsAction.Request.RESET_AUDITOR_REQUEST,
21872188
ActionListener.wrap(ignored -> unsetResetModeListener.onResponse(success), unsetResetModeListener::onFailure)
21882189
);
21892190
}, failure -> {
21902191
logger.error("failed to reset machine learning", failure);
21912192
client.execute(
2192-
ResetAuditorAction.INSTANCE,
2193-
ResetAuditorAction.Request.RESET_AUDITOR_REQUEST,
2193+
ResetMlComponentsAction.INSTANCE,
2194+
ResetMlComponentsAction.Request.RESET_AUDITOR_REQUEST,
21942195
ActionListener.wrap(ignored -> unsetResetModeListener.onFailure(failure), unsetResetModeListener::onFailure)
21952196
);
21962197
});
@@ -2325,11 +2326,11 @@ public void cleanUpFeature(
23252326
);
23262327
client.execute(CancelJobModelSnapshotUpgradeAction.INSTANCE, cancelSnapshotUpgradesReq, delegate);
23272328
}).delegateFailureAndWrap((delegate, acknowledgedResponse) -> {
2328-
if (trainedModelAllocationClusterServiceSetOnce.get() == null || machineLearningExtension.get().isNlpEnabled() == false) {
2329+
if (trainedModelAllocationClusterService.get() == null || machineLearningExtension.get().isNlpEnabled() == false) {
23292330
delegate.onResponse(AcknowledgedResponse.TRUE);
23302331
return;
23312332
}
2332-
trainedModelAllocationClusterServiceSetOnce.get().removeAllModelAssignments(delegate);
2333+
trainedModelAllocationClusterService.get().removeAllModelAssignments(delegate);
23332334
});
23342335

23352336
// validate no pipelines are using machine learning models

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportGetDataFrameAnalyticsStatsAction.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
import org.elasticsearch.xpack.ml.utils.persistence.MlParserUtils;
6262

6363
import java.util.ArrayList;
64+
import java.util.Arrays;
6465
import java.util.Collections;
6566
import java.util.Comparator;
6667
import java.util.List;
@@ -278,7 +279,7 @@ private void searchStats(DataFrameAnalyticsConfig config, TaskId parentTaskId, A
278279
() -> format(
279280
"[%s] Item failure encountered during multi search for request [indices=%s, source=%s]: %s",
280281
config.getId(),
281-
itemRequest.indices(),
282+
Arrays.toString(itemRequest.indices()),
282283
itemRequest.source(),
283284
itemResponse.getFailureMessage()
284285
),
Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -17,72 +17,77 @@
1717
import org.elasticsearch.tasks.Task;
1818
import org.elasticsearch.threadpool.ThreadPool;
1919
import org.elasticsearch.transport.TransportService;
20-
import org.elasticsearch.xpack.core.ml.action.ResetAuditorAction;
20+
import org.elasticsearch.xpack.core.ml.action.ResetMlComponentsAction;
21+
import org.elasticsearch.xpack.ml.inference.TrainedModelStatsService;
2122
import org.elasticsearch.xpack.ml.notifications.AnomalyDetectionAuditor;
2223
import org.elasticsearch.xpack.ml.notifications.DataFrameAnalyticsAuditor;
2324
import org.elasticsearch.xpack.ml.notifications.InferenceAuditor;
2425

2526
import java.io.IOException;
2627
import java.util.List;
2728

28-
public class TransportResetAuditorAction extends TransportNodesAction<
29-
ResetAuditorAction.Request,
30-
ResetAuditorAction.Response,
31-
ResetAuditorAction.NodeRequest,
32-
ResetAuditorAction.Response.ResetResponse,
29+
public class TransportResetMlComponentsAction extends TransportNodesAction<
30+
ResetMlComponentsAction.Request,
31+
ResetMlComponentsAction.Response,
32+
ResetMlComponentsAction.NodeRequest,
33+
ResetMlComponentsAction.Response.ResetResponse,
3334
Void> {
3435

3536
private final AnomalyDetectionAuditor anomalyDetectionAuditor;
3637
private final DataFrameAnalyticsAuditor dfaAuditor;
3738
private final InferenceAuditor inferenceAuditor;
39+
private final TrainedModelStatsService trainedModelStatsService;
3840

3941
@Inject
40-
public TransportResetAuditorAction(
42+
public TransportResetMlComponentsAction(
4143
ThreadPool threadPool,
4244
ClusterService clusterService,
4345
TransportService transportService,
4446
ActionFilters actionFilters,
4547
AnomalyDetectionAuditor anomalyDetectionAuditor,
4648
DataFrameAnalyticsAuditor dfaAuditor,
47-
InferenceAuditor inferenceAuditor
49+
InferenceAuditor inferenceAuditor,
50+
TrainedModelStatsService trainedModelStatsService
4851
) {
4952
super(
50-
ResetAuditorAction.NAME,
53+
ResetMlComponentsAction.NAME,
5154
clusterService,
5255
transportService,
5356
actionFilters,
54-
ResetAuditorAction.NodeRequest::new,
57+
ResetMlComponentsAction.NodeRequest::new,
5558
threadPool.executor(ThreadPool.Names.MANAGEMENT)
5659
);
5760
this.anomalyDetectionAuditor = anomalyDetectionAuditor;
5861
this.dfaAuditor = dfaAuditor;
5962
this.inferenceAuditor = inferenceAuditor;
63+
this.trainedModelStatsService = trainedModelStatsService;
6064
}
6165

6266
@Override
63-
protected ResetAuditorAction.Response newResponse(
64-
ResetAuditorAction.Request request,
65-
List<ResetAuditorAction.Response.ResetResponse> resetResponses,
67+
protected ResetMlComponentsAction.Response newResponse(
68+
ResetMlComponentsAction.Request request,
69+
List<ResetMlComponentsAction.Response.ResetResponse> resetResponses,
6670
List<FailedNodeException> failures
6771
) {
68-
return new ResetAuditorAction.Response(clusterService.getClusterName(), resetResponses, failures);
72+
return new ResetMlComponentsAction.Response(clusterService.getClusterName(), resetResponses, failures);
6973
}
7074

7175
@Override
72-
protected ResetAuditorAction.NodeRequest newNodeRequest(ResetAuditorAction.Request request) {
73-
return new ResetAuditorAction.NodeRequest();
76+
protected ResetMlComponentsAction.NodeRequest newNodeRequest(ResetMlComponentsAction.Request request) {
77+
return new ResetMlComponentsAction.NodeRequest();
7478
}
7579

7680
@Override
77-
protected ResetAuditorAction.Response.ResetResponse newNodeResponse(StreamInput in, DiscoveryNode node) throws IOException {
78-
return new ResetAuditorAction.Response.ResetResponse(in);
81+
protected ResetMlComponentsAction.Response.ResetResponse newNodeResponse(StreamInput in, DiscoveryNode node) throws IOException {
82+
return new ResetMlComponentsAction.Response.ResetResponse(in);
7983
}
8084

8185
@Override
82-
protected ResetAuditorAction.Response.ResetResponse nodeOperation(ResetAuditorAction.NodeRequest request, Task task) {
86+
protected ResetMlComponentsAction.Response.ResetResponse nodeOperation(ResetMlComponentsAction.NodeRequest request, Task task) {
8387
anomalyDetectionAuditor.reset();
8488
dfaAuditor.reset();
8589
inferenceAuditor.reset();
86-
return new ResetAuditorAction.Response.ResetResponse(clusterService.localNode(), true);
90+
trainedModelStatsService.clearQueue();
91+
return new ResetMlComponentsAction.Response.ResetResponse(clusterService.localNode(), true);
8792
}
8893
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/TrainedModelStatsService.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,4 +295,8 @@ static UpdateRequest buildUpdateRequest(InferenceStats stats) {
295295
}
296296
return null;
297297
}
298+
299+
public void clearQueue() {
300+
statsQueue.clear();
301+
}
298302
}

0 commit comments

Comments
 (0)