Skip to content

Commit adc1186

Browse files
authored
[ML] Prevent the trained model deployment memory estimation from double-counting allocations. (#131990) (#131995)
This reverts commit 971cfb9. The refactoring in 971cfb9 introduced a bug that could potentially lead to double-counting of the number of allocations in the trained model memory estimation.
1 parent 39994fc commit adc1186

File tree

6 files changed

+33
-14
lines changed

6 files changed

+33
-14
lines changed

docs/changelog/131990.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 131990
2+
summary: Prevent the trained model deployment memory estimation from double-counting
3+
allocations
4+
area: Machine Learning
5+
type: bug
6+
issues: []

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/assignment/TrainedModelAssignmentRebalancer.java

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -140,9 +140,13 @@ private static void copyAssignments(
140140
for (Map.Entry<AssignmentPlan.Node, Integer> assignment : nodeAssignments.entrySet()) {
141141
AssignmentPlan.Node originalNode = originalNodeById.get(assignment.getKey().id());
142142
dest.assignModelToNode(m, originalNode, assignment.getValue());
143-
// As the node has all its available memory we need to manually account memory of models with
144-
// current allocations.
145-
dest.accountMemory(m, originalNode);
143+
if (m.currentAllocationsByNodeId().containsKey(originalNode.id())) {
144+
// TODO (#101612) requiredMemory should be calculated by the AssignmentPlan.Builder
145+
// As the node has all its available memory we need to manually account memory of models with
146+
// current allocations.
147+
long requiredMemory = m.estimateMemoryUsageBytes(m.currentAllocationsByNodeId().get(originalNode.id()));
148+
dest.accountMemory(m, originalNode, requiredMemory);
149+
}
146150
}
147151
}
148152
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/assignment/planning/AbstractPreserveAllocations.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ Deployment modifyModelPreservingPreviousAssignments(Deployment m) {
7070
AssignmentPlan mergePreservedAllocations(AssignmentPlan assignmentPlan) {
7171
// As the model/node objects the assignment plan are the modified ones,
7272
// they will not match the models/nodes members we have in this class.
73-
// Therefore, we build a lookup table based on the ids, so we can merge the plan
73+
// Therefore, we build a lookup table based on the ids so we can merge the plan
7474
// with its preserved allocations.
7575
final Map<Tuple<String, String>, Integer> plannedAssignmentsByDeploymentNodeIdPair = new HashMap<>();
7676
for (Deployment d : assignmentPlan.deployments()) {
@@ -85,6 +85,7 @@ AssignmentPlan mergePreservedAllocations(AssignmentPlan assignmentPlan) {
8585

8686
AssignmentPlan.Builder mergedPlanBuilder = AssignmentPlan.builder(nodes, deployments);
8787
for (Node n : nodes) {
88+
// TODO (#101612) Should the first loop happen in the builder constructor?
8889
for (Deployment deploymentAllocationsToPreserve : deployments) {
8990

9091
// if the model m is already allocated on the node n and I want to preserve this allocation

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/assignment/planning/AssignmentPlan.java

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -424,7 +424,8 @@ public Builder assignModelToNode(Deployment deployment, Node node, int allocatio
424424
if (allocations <= 0) {
425425
return this;
426426
}
427-
if (requiredMemory > remainingNodeMemory.get(node)) {
427+
if (/*isAlreadyAssigned(deployment, node) == false
428+
&&*/ requiredMemory > remainingNodeMemory.get(node)) {
428429
throw new IllegalArgumentException(
429430
"not enough memory on node ["
430431
+ node.id()
@@ -470,14 +471,13 @@ private static int getCurrentAllocations(Deployment m, Node n) {
470471
}
471472

472473
public void accountMemory(Deployment m, Node n) {
473-
if (m.currentAllocationsByNodeId().containsKey(n.id())) {
474-
int allocations = m.currentAllocationsByNodeId().get(n.id());
475-
long requiredMemory = m.estimateMemoryUsageBytes(allocations);
476-
accountMemory(m, n, requiredMemory);
477-
}
474+
// TODO (#101612) remove or refactor unused method
475+
long requiredMemory = getDeploymentMemoryRequirement(m, n, getCurrentAllocations(m, n));
476+
accountMemory(m, n, requiredMemory);
478477
}
479478

480-
private void accountMemory(Deployment m, Node n, long requiredMemory) {
479+
public void accountMemory(Deployment m, Node n, long requiredMemory) {
480+
// TODO (#101612) computation of required memory should be done internally
481481
remainingNodeMemory.computeIfPresent(n, (k, v) -> v - requiredMemory);
482482
if (remainingNodeMemory.containsKey(n) && remainingNodeMemory.get(n) < 0) {
483483
throw new IllegalArgumentException("not enough memory on node [" + n.id() + "] to assign model [" + m.deploymentId() + "]");

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/assignment/planning/RandomizedAssignmentRounding.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,8 @@ private void unassignOversizedModels(Node n) {
310310
private AssignmentPlan toPlan() {
311311
AssignmentPlan.Builder builder = AssignmentPlan.builder(nodes, deployments);
312312
for (Map.Entry<Tuple<AssignmentPlan.Deployment, Node>, Integer> assignment : tryAssigningRemainingCores().entrySet()) {
313+
// TODO (#101612) The model should be assigned to the node only when it is possible. This means, that canAssign should be
314+
// integrated into the assignModelToNode.
313315
if (builder.canAssign(assignment.getKey().v1(), assignment.getKey().v2(), assignment.getValue())) {
314316
builder.assignModelToNode(assignment.getKey().v1(), assignment.getKey().v2(), assignment.getValue());
315317
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/assignment/planning/ZoneAwareAssignmentPlanner.java

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -200,9 +200,15 @@ private AssignmentPlan swapOriginalModelsInPlan(
200200
for (Map.Entry<Node, Integer> assignment : nodeAssignments.entrySet()) {
201201
Node originalNode = originalNodeById.get(assignment.getKey().id());
202202
planBuilder.assignModelToNode(originalDeployment, originalNode, assignment.getValue());
203-
// As the node has all its available memory we need to manually account memory of models with
204-
// current allocations.
205-
planBuilder.accountMemory(originalDeployment, originalNode);
203+
if (originalDeployment.currentAllocationsByNodeId().containsKey(originalNode.id())) {
204+
// TODO (#101612) requiredMemory should be calculated by the AssignmentPlan.Builder
205+
// As the node has all its available memory we need to manually account memory of models with
206+
// current allocations.
207+
long requiredMemory = originalDeployment.estimateMemoryUsageBytes(
208+
originalDeployment.currentAllocationsByNodeId().get(originalNode.id())
209+
);
210+
planBuilder.accountMemory(m, originalNode, requiredMemory);
211+
}
206212
}
207213
}
208214
return planBuilder.build();

0 commit comments

Comments
 (0)