Skip to content

Commit e5c91ca

Browse files
authored
[ML] Fix model assignment error handling and assignment explanation generation (#133916)
This PR ensures that in case of insufficient memory on a node, the internal IllegalArgumentException from assignModelToNode is not leaked to the upper layers of the architecture by first checking that the model can be assigned to the node. The check canAssign() is now moved into the function assignModelToNode().
1 parent bc43a32 commit e5c91ca

File tree

7 files changed

+23
-101
lines changed

7 files changed

+23
-101
lines changed

docs/changelog/133916.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 133916
2+
summary: Fix model assignment error handling and assignment explanation generation
3+
area: Machine Learning
4+
type: bug
5+
issues: []

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/assignment/planning/AbstractPreserveAllocations.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,8 +103,7 @@ AssignmentPlan mergePreservedAllocations(AssignmentPlan assignmentPlan) {
103103
0
104104
);
105105

106-
long requiredMemory = mergedPlanBuilder.getDeploymentMemoryRequirement(deploymentNewAllocations, n, newAllocations);
107-
if (newAllocations > 0 && mergedPlanBuilder.canAssign(deploymentNewAllocations, n, newAllocations, requiredMemory)) {
106+
if (newAllocations > 0) {
108107
mergedPlanBuilder.assignModelToNode(deploymentNewAllocations, n, newAllocations);
109108
}
110109
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/assignment/planning/AssignmentPlan.java

Lines changed: 1 addition & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -441,33 +441,9 @@ public Builder assignModelToNode(Deployment deployment, Node node, int allocatio
441441
}
442442

443443
public Builder assignModelToNode(Deployment deployment, Node node, int allocations, long requiredMemory) {
444-
if (allocations <= 0) {
444+
if (allocations <= 0 || canAssign(deployment, node, allocations, requiredMemory) == false) {
445445
return this;
446446
}
447-
if (requiredMemory > remainingNodeMemory.get(node)) {
448-
throw new IllegalArgumentException(
449-
"not enough memory on node ["
450-
+ node.id()
451-
+ "] to assign ["
452-
+ allocations
453-
+ "] allocations to deployment ["
454-
+ deployment.deploymentId()
455-
+ "]"
456-
);
457-
}
458-
if (deployment.priority == Priority.NORMAL && allocations * deployment.threadsPerAllocation() > remainingNodeCores.get(node)) {
459-
throw new IllegalArgumentException(
460-
"not enough cores on node ["
461-
+ node.id()
462-
+ "] to assign ["
463-
+ allocations
464-
+ "] allocations to deployment ["
465-
+ deployment.deploymentId()
466-
+ "]; required threads per allocation ["
467-
+ deployment.threadsPerAllocation()
468-
+ "]"
469-
);
470-
}
471447

472448
assignments.get(deployment).compute(node, (n, assignedAllocations) -> assignedAllocations + allocations);
473449
accountMemory(deployment, node, requiredMemory);

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/assignment/planning/RandomizedAssignmentRounding.java

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -310,11 +310,7 @@ private void unassignOversizedModels(Node n) {
310310
private AssignmentPlan toPlan() {
311311
AssignmentPlan.Builder builder = AssignmentPlan.builder(nodes, deployments);
312312
for (Map.Entry<Tuple<AssignmentPlan.Deployment, Node>, Integer> assignment : tryAssigningRemainingCores().entrySet()) {
313-
// TODO (#101612) The model should be assigned to the node only when it is possible. This means, that canAssign should be
314-
// integrated into the assignModelToNode.
315-
if (builder.canAssign(assignment.getKey().v1(), assignment.getKey().v2(), assignment.getValue())) {
316-
builder.assignModelToNode(assignment.getKey().v1(), assignment.getKey().v2(), assignment.getValue());
317-
}
313+
builder.assignModelToNode(assignment.getKey().v1(), assignment.getKey().v2(), assignment.getValue());
318314
}
319315
return builder.build();
320316
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/assignment/planning/ZoneAwareAssignmentPlanner.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,8 @@ private AssignmentPlan computePlan(boolean tryAssigningPreviouslyAssignedModels)
9191
remainingZones,
9292
tryAssigningPreviouslyAssignedModels
9393
);
94+
95+
// Update remaining allocations to account for allocations satisfied in this zone
9496
plan.deployments()
9597
.forEach(
9698
d -> deploymentIdToRemainingAllocations.computeIfPresent(
@@ -217,6 +219,14 @@ private AssignmentPlan swapOriginalDeploymentsInPlan(
217219
return finalPlanBuilder.build();
218220
}
219221

222+
/**
223+
* The mergeAllocationsByNodeIdByDeploymentId method is responsible for consolidating allocation data
224+
* from multiple AssignmentPlan objects into a single structure. This structure maps deployment IDs
225+
* to their respective node allocations, allowing the system to track how resources are distributed
226+
* across nodes for each deployment.
227+
* @param plans List of AssignmentPlan objects to merge allocations from
228+
* @return
229+
*/
220230
private Map<String, Map<String, Integer>> mergeAllocationsByNodeIdByDeploymentId(List<AssignmentPlan> plans) {
221231
Map<String, Map<String, Integer>> allocationsByNodeIdByDeploymentId = new HashMap<>();
222232
deployments.forEach(d -> allocationsByNodeIdByDeploymentId.put(d.deploymentId(), new HashMap<>()));

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/assignment/TrainedModelAssignmentRebalancerTests.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1198,7 +1198,7 @@ public void testCopyAssignments() {
11981198
assertThat(deployment2Assignments.get().get(node2), equalTo(1));
11991199
}
12001200

1201-
public void testRebalance_GivenDeploymentWithMemoryRequirements_ConsidersNativeExecutableOverhead() {
1201+
public void testRebalance_GivenDeploymentWithMemoryRequirements_ExplainMissingAllocations() {
12021202
// Create a node with just enough memory to fit the model plus native executable overhead
12031203
long modelMemory = ByteSizeValue.ofMb(200).getBytes();
12041204
long memoryOverhead = ByteSizeValue.ofMb(240).getBytes();

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/assignment/planning/AssignmentPlanTests.java

Lines changed: 4 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
import java.util.Map;
1717

1818
import static org.hamcrest.Matchers.contains;
19-
import static org.hamcrest.Matchers.containsString;
2019
import static org.hamcrest.Matchers.equalTo;
2120
import static org.hamcrest.Matchers.greaterThan;
2221
import static org.hamcrest.Matchers.is;
@@ -233,73 +232,15 @@ public void testAssignModelToNode_GivenNewPlanDoesNotSatisfyCurrentAssignment()
233232
}
234233
}
235234

236-
public void testAssignModelToNode_GivenPreviouslyUnassignedModelDoesNotFit() {
237-
Node n = new Node("n_1", ByteSizeValue.ofMb(340 - 1).getBytes(), 4);
238-
Deployment m = new AssignmentPlan.Deployment("m_1", "m_1", ByteSizeValue.ofMb(50).getBytes(), 2, 2, Map.of(), 0, null, 0, 0);
239-
240-
AssignmentPlan.Builder builder = AssignmentPlan.builder(List.of(n), List.of(m));
241-
Exception e = expectThrows(IllegalArgumentException.class, () -> builder.assignModelToNode(m, n, 1));
242-
243-
assertThat(e.getMessage(), equalTo("not enough memory on node [n_1] to assign [1] allocations to deployment [m_1]"));
244-
}
245-
246-
public void testAssignModelToNode_GivenPreviouslyAssignedModelDoesNotFit() {
247-
{ // old memory format
248-
Node n = new Node("n_1", ByteSizeValue.ofMb(340 - 1).getBytes(), 4);
249-
AssignmentPlan.Deployment m = new AssignmentPlan.Deployment(
250-
"m_1",
251-
"m_1",
252-
ByteSizeValue.ofMb(50).getBytes(),
253-
2,
254-
2,
255-
Map.of("n_1", 1),
256-
0,
257-
null,
258-
0,
259-
0
260-
);
261-
262-
AssignmentPlan.Builder builder = AssignmentPlan.builder(List.of(n), List.of(m));
263-
264-
Exception e = expectThrows(IllegalArgumentException.class, () -> builder.assignModelToNode(m, n, 2));
265-
assertThat(e.getMessage(), containsString("not enough memory on node"));
266-
}
267-
{ // new memory format
268-
Node n = new Node("n_1", ByteSizeValue.ofMb(340 - 1).getBytes(), 4);
269-
AssignmentPlan.Deployment m = new AssignmentPlan.Deployment(
270-
"m_1",
271-
"m_1",
272-
ByteSizeValue.ofMb(30).getBytes(),
273-
2,
274-
2,
275-
Map.of("n_1", 1),
276-
0,
277-
null,
278-
ByteSizeValue.ofMb(300).getBytes(),
279-
ByteSizeValue.ofMb(5).getBytes()
280-
);
281-
282-
AssignmentPlan.Builder builder = AssignmentPlan.builder(List.of(n), List.of(m));
283-
284-
Exception e = expectThrows(IllegalArgumentException.class, () -> builder.assignModelToNode(m, n, 2));
285-
assertThat(e.getMessage(), containsString("not enough memory on node"));
286-
}
287-
}
288-
289-
public void testAssignModelToNode_GivenNotEnoughCores_AndSingleThreadPerAllocation() {
235+
public void testCanAssign_GivenNotEnoughCores_AndSingleThreadPerAllocation() {
290236
Node n = new Node("n_1", ByteSizeValue.ofMb(500).getBytes(), 4);
291237
Deployment m = new AssignmentPlan.Deployment("m_1", "m_1", ByteSizeValue.ofMb(100).getBytes(), 5, 1, Map.of(), 0, null, 0, 0);
292238

293239
AssignmentPlan.Builder builder = AssignmentPlan.builder(List.of(n), List.of(m));
294-
Exception e = expectThrows(IllegalArgumentException.class, () -> builder.assignModelToNode(m, n, 5));
295-
296-
assertThat(
297-
e.getMessage(),
298-
equalTo("not enough cores on node [n_1] to assign [5] allocations to deployment [m_1]; required threads per allocation [1]")
299-
);
240+
assertThat(builder.canAssign(m, n, 5), is(false));
300241
}
301242

302-
public void testAssignModelToNode_GivenNotEnoughCores_AndMultipleThreadsPerAllocation() {
243+
public void testCanAssign_GivenNotEnoughCores_AndMultipleThreadsPerAllocation() {
303244
Node n = new Node("n_1", ByteSizeValue.ofMb(500).getBytes(), 5);
304245
AssignmentPlan.Deployment m = new AssignmentPlan.Deployment(
305246
"m_1",
@@ -315,12 +256,7 @@ public void testAssignModelToNode_GivenNotEnoughCores_AndMultipleThreadsPerAlloc
315256
);
316257

317258
AssignmentPlan.Builder builder = AssignmentPlan.builder(List.of(n), List.of(m));
318-
Exception e = expectThrows(IllegalArgumentException.class, () -> builder.assignModelToNode(m, n, 3));
319-
320-
assertThat(
321-
e.getMessage(),
322-
equalTo("not enough cores on node [n_1] to assign [3] allocations to deployment [m_1]; required threads per allocation [2]")
323-
);
259+
assertThat(builder.canAssign(m, n, 3), is(false));
324260
}
325261

326262
public void testAssignModelToNode_GivenSameModelAssignedTwice() {

0 commit comments

Comments
 (0)