Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/133916.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 133916
summary: Fix model assignment error handling and assignment explanation generation
area: Machine Learning
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,7 @@ AssignmentPlan mergePreservedAllocations(AssignmentPlan assignmentPlan) {
0
);

long requiredMemory = mergedPlanBuilder.getDeploymentMemoryRequirement(deploymentNewAllocations, n, newAllocations);
if (newAllocations > 0 && mergedPlanBuilder.canAssign(deploymentNewAllocations, n, newAllocations, requiredMemory)) {
if (newAllocations > 0) {
mergedPlanBuilder.assignModelToNode(deploymentNewAllocations, n, newAllocations);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,7 @@ public Builder assignModelToNode(Deployment deployment, Node node, int allocatio
}

public Builder assignModelToNode(Deployment deployment, Node node, int allocations, long requiredMemory) {
if (allocations <= 0) {
if (allocations <= 0 || canAssign(deployment, node, allocations, requiredMemory) == false) {
return this;
}
if (requiredMemory > remainingNodeMemory.get(node)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -310,11 +310,7 @@ private void unassignOversizedModels(Node n) {
private AssignmentPlan toPlan() {
AssignmentPlan.Builder builder = AssignmentPlan.builder(nodes, deployments);
for (Map.Entry<Tuple<AssignmentPlan.Deployment, Node>, Integer> assignment : tryAssigningRemainingCores().entrySet()) {
// TODO (#101612) The model should be assigned to the node only when it is possible. This means, that canAssign should be
// integrated into the assignModelToNode.
if (builder.canAssign(assignment.getKey().v1(), assignment.getKey().v2(), assignment.getValue())) {
builder.assignModelToNode(assignment.getKey().v1(), assignment.getKey().v2(), assignment.getValue());
}
builder.assignModelToNode(assignment.getKey().v1(), assignment.getKey().v2(), assignment.getValue());
}
return builder.build();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ private AssignmentPlan computePlan(boolean tryAssigningPreviouslyAssignedModels)
remainingZones,
tryAssigningPreviouslyAssignedModels
);

// Update remaining allocations to account for allocations satisfied in this zone
plan.deployments()
.forEach(
d -> deploymentIdToRemainingAllocations.computeIfPresent(
Expand Down Expand Up @@ -217,6 +219,14 @@ private AssignmentPlan swapOriginalDeploymentsInPlan(
return finalPlanBuilder.build();
}

/**
* The mergeAllocationsByNodeIdByDeploymentId method is responsible for consolidating allocation data
* from multiple AssignmentPlan objects into a single structure. This structure maps deployment IDs
* to their respective node allocations, allowing the system to track how resources are distributed
* across nodes for each deployment.
* @param plans List of AssignmentPlan objects to merge allocations from
* @return
*/
private Map<String, Map<String, Integer>> mergeAllocationsByNodeIdByDeploymentId(List<AssignmentPlan> plans) {
Map<String, Map<String, Integer>> allocationsByNodeIdByDeploymentId = new HashMap<>();
deployments.forEach(d -> allocationsByNodeIdByDeploymentId.put(d.deploymentId(), new HashMap<>()));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1198,7 +1198,7 @@ public void testCopyAssignments() {
assertThat(deployment2Assignments.get().get(node2), equalTo(1));
}

public void testRebalance_GivenDeploymentWithMemoryRequirements_ConsidersNativeExecutableOverhead() {
public void testRebalance_GivenDeploymentWithMemoryRequirements_ExplainMissingAllocations() {
// Create a node with just enough memory to fit the model plus native executable overhead
long modelMemory = ByteSizeValue.ofMb(200).getBytes();
long memoryOverhead = ByteSizeValue.ofMb(240).getBytes();
Expand Down