Skip to content

Commit d8459d1

Browse files
authored
[ML] Fix NPE in Get Deployment Stats (#115404) (#115473)
If a node has been removed from the cluster and the trained model assignment has not been updated, the GET stats action can have an inconsistent view where it thinks a model is deployed on the removed node. The bug only affected nodes with failed deployments.
1 parent 67a7682 commit d8459d1

File tree

3 files changed

+45
-1
lines changed

3 files changed

+45
-1
lines changed

docs/changelog/115404.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 115404
2+
summary: Fix NPE in Get Deployment Stats
3+
area: Machine Learning
4+
type: bug
5+
issues: []

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportGetDeploymentStatsAction.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ static GetDeploymentStatsAction.Response addFailedRoutes(
220220

221221
// add nodes from the failures that were not in the task responses
222222
for (var nodeRoutingState : nodeToRoutingStates.entrySet()) {
223-
if (visitedNodes.contains(nodeRoutingState.getKey()) == false) {
223+
if ((visitedNodes.contains(nodeRoutingState.getKey()) == false) && nodes.nodeExists(nodeRoutingState.getKey())) {
224224
updatedNodeStats.add(
225225
AssignmentStats.NodeStats.forNotStartedState(
226226
nodes.get(nodeRoutingState.getKey()),

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/action/TransportGetDeploymentStatsActionTests.java

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,45 @@ public void testAddFailedRoutes_TaskResultIsOverwritten() throws UnknownHostExce
148148
assertEquals(RoutingState.FAILED, results.get(0).getNodeStats().get(1).getRoutingState().getState());
149149
}
150150

151+
public void testAddFailedRoutes_MissingNode() throws UnknownHostException {
152+
DiscoveryNodes nodes = buildNodes("node1", "node2");
153+
var missingNode = DiscoveryNodeUtils.create(
154+
"node3",
155+
new TransportAddress(InetAddress.getByAddress(new byte[] { (byte) 192, (byte) 168, (byte) 0, (byte) 1 }), 9203)
156+
);
157+
158+
List<AssignmentStats.NodeStats> nodeStatsList = new ArrayList<>();
159+
nodeStatsList.add(AssignmentStatsTests.randomNodeStats(nodes.get("node1")));
160+
nodeStatsList.add(AssignmentStatsTests.randomNodeStats(nodes.get("node2")));
161+
162+
var model1 = new AssignmentStats(
163+
"model1",
164+
"deployment1",
165+
randomBoolean() ? null : randomIntBetween(1, 8),
166+
randomBoolean() ? null : randomIntBetween(1, 8),
167+
null,
168+
randomBoolean() ? null : randomIntBetween(1, 10000),
169+
randomBoolean() ? null : ByteSizeValue.ofBytes(randomLongBetween(1, 1000000)),
170+
Instant.now(),
171+
nodeStatsList,
172+
randomFrom(Priority.values())
173+
);
174+
var response = new GetDeploymentStatsAction.Response(Collections.emptyList(), Collections.emptyList(), List.of(model1), 1);
175+
176+
// failed state for node 3 conflicts
177+
Map<TrainedModelAssignment, Map<String, RoutingInfo>> badRoutes = new HashMap<>();
178+
Map<String, RoutingInfo> nodeRoutes = new HashMap<>();
179+
nodeRoutes.put("node3", new RoutingInfo(1, 1, RoutingState.FAILED, "failed on node3"));
180+
badRoutes.put(createAssignment("model1"), nodeRoutes);
181+
182+
var modified = TransportGetDeploymentStatsAction.addFailedRoutes(response, badRoutes, nodes);
183+
List<AssignmentStats> results = modified.getStats().results();
184+
assertThat(results, hasSize(1));
185+
assertThat(results.get(0).getNodeStats(), hasSize(2)); // 3
186+
assertEquals("node1", results.get(0).getNodeStats().get(0).getNode().getId());
187+
assertEquals("node2", results.get(0).getNodeStats().get(1).getNode().getId());
188+
}
189+
151190
private DiscoveryNodes buildNodes(String... nodeIds) throws UnknownHostException {
152191
InetAddress inetAddress = InetAddress.getByAddress(new byte[] { (byte) 192, (byte) 168, (byte) 0, (byte) 1 });
153192
DiscoveryNodes.Builder builder = DiscoveryNodes.builder();

0 commit comments

Comments
 (0)