13
13
import org .elasticsearch .ResourceAlreadyExistsException ;
14
14
import org .elasticsearch .cluster .node .DiscoveryNode ;
15
15
import org .elasticsearch .common .Strings ;
16
- import org .elasticsearch .common .settings .Settings ;
17
16
import org .elasticsearch .common .unit .ByteSizeValue ;
18
17
import org .elasticsearch .xpack .core .ml .action .StartTrainedModelDeploymentAction ;
19
18
import org .elasticsearch .xpack .core .ml .inference .assignment .Priority ;
@@ -51,20 +50,23 @@ class TrainedModelAssignmentRebalancer {
51
50
private final Map <DiscoveryNode , NodeLoad > nodeLoads ;
52
51
private final Map <List <String >, Collection <DiscoveryNode >> mlNodesByZone ;
53
52
private final Optional <StartTrainedModelDeploymentAction .TaskParams > deploymentToAdd ;
53
+ private final int allocatedProcessorsScale ;
54
54
55
55
/**
 * Creates a rebalancer from the current assignment metadata, the per-node loads,
 * the ML nodes grouped by zone key, and an optional deployment to add.
 *
 * <p>All reference arguments are required; passing {@code null} for any of them
 * fails fast with a {@link NullPointerException}. Use {@link Optional#empty()} for
 * {@code deploymentToAdd} when no new deployment is being added.
 *
 * @param currentMetadata          the trained model assignment metadata to rebalance from; must not be null
 * @param nodeLoads                load information keyed by node; must not be null
 * @param mlNodesByZone            nodes grouped under a list-of-strings key; must not be null
 * @param deploymentToAdd          the task params of a deployment to add, if any; must not be null
 * @param allocatedProcessorsScale value forwarded to {@code MlProcessors.get(node, allocatedProcessorsScale)}
 *                                 when computing a node's processors; stored as-is without validation
 * @throws NullPointerException if any reference argument is null
 */
TrainedModelAssignmentRebalancer(
    TrainedModelAssignmentMetadata currentMetadata,
    Map<DiscoveryNode, NodeLoad> nodeLoads,
    Map<List<String>, Collection<DiscoveryNode>> mlNodesByZone,
    Optional<StartTrainedModelDeploymentAction.TaskParams> deploymentToAdd,
    int allocatedProcessorsScale
) {
    this.currentMetadata = Objects.requireNonNull(currentMetadata);
    this.nodeLoads = Objects.requireNonNull(nodeLoads);
    this.mlNodesByZone = Objects.requireNonNull(mlNodesByZone);
    this.deploymentToAdd = Objects.requireNonNull(deploymentToAdd);
    // Captured once here so the methods of this class no longer need a Settings argument
    // to resolve the processors of a node.
    this.allocatedProcessorsScale = allocatedProcessorsScale;
}
66
68
67
- TrainedModelAssignmentMetadata .Builder rebalance (Settings settings ) {
69
+ TrainedModelAssignmentMetadata .Builder rebalance () {
68
70
if (deploymentToAdd .isPresent () && currentMetadata .hasDeployment (deploymentToAdd .get ().getDeploymentId ())) {
69
71
throw new ResourceAlreadyExistsException (
70
72
"[{}] assignment for deployment with model [{}] already exists" ,
@@ -78,8 +80,8 @@ TrainedModelAssignmentMetadata.Builder rebalance(Settings settings) {
78
80
return TrainedModelAssignmentMetadata .Builder .fromMetadata (currentMetadata );
79
81
}
80
82
81
- AssignmentPlan assignmentPlan = computeAssignmentPlan (settings );
82
- return buildAssignmentsFromPlan (assignmentPlan , settings );
83
+ AssignmentPlan assignmentPlan = computeAssignmentPlan ();
84
+ return buildAssignmentsFromPlan (assignmentPlan );
83
85
}
84
86
85
87
private boolean areAllModelsSatisfiedAndNoOutdatedRoutingEntries () {
@@ -92,8 +94,8 @@ private boolean areAllModelsSatisfiedAndNoOutdatedRoutingEntries() {
92
94
return true ;
93
95
}
94
96
95
- AssignmentPlan computeAssignmentPlan (Settings settings ) {
96
- final Map <List <String >, List <AssignmentPlan .Node >> nodesByZone = createNodesByZoneMap (settings );
97
+ AssignmentPlan computeAssignmentPlan () {
98
+ final Map <List <String >, List <AssignmentPlan .Node >> nodesByZone = createNodesByZoneMap ();
97
99
final Set <String > assignableNodeIds = nodesByZone .values ()
98
100
.stream ()
99
101
.flatMap (List ::stream )
@@ -271,7 +273,7 @@ private Map<String, Integer> findFittingAssignments(
271
273
return fittingAssignments ;
272
274
}
273
275
274
- private Map <List <String >, List <AssignmentPlan .Node >> createNodesByZoneMap (Settings settings ) {
276
+ private Map <List <String >, List <AssignmentPlan .Node >> createNodesByZoneMap () {
275
277
return mlNodesByZone .entrySet ().stream ().collect (Collectors .toMap (e -> e .getKey (), e -> {
276
278
Collection <DiscoveryNode > discoveryNodes = e .getValue ();
277
279
List <AssignmentPlan .Node > nodes = new ArrayList <>();
@@ -285,7 +287,7 @@ private Map<List<String>, List<AssignmentPlan.Node>> createNodesByZoneMap(Settin
285
287
// We subtract native inference memory as the planner expects available memory for
286
288
// native inference including current assignments.
287
289
getNodeFreeMemoryExcludingPerNodeOverheadAndNativeInference (load ),
288
- MlProcessors .get (discoveryNode , settings ).roundUp ()
290
+ MlProcessors .get (discoveryNode , allocatedProcessorsScale ).roundUp ()
289
291
)
290
292
);
291
293
} else {
@@ -305,7 +307,7 @@ private static long getNodeFreeMemoryExcludingPerNodeOverheadAndNativeInference(
305
307
return load .getFreeMemoryExcludingPerNodeOverhead () - load .getAssignedNativeInferenceMemory ();
306
308
}
307
309
308
- private TrainedModelAssignmentMetadata .Builder buildAssignmentsFromPlan (AssignmentPlan assignmentPlan , Settings settings ) {
310
+ private TrainedModelAssignmentMetadata .Builder buildAssignmentsFromPlan (AssignmentPlan assignmentPlan ) {
309
311
TrainedModelAssignmentMetadata .Builder builder = TrainedModelAssignmentMetadata .Builder .empty ();
310
312
for (AssignmentPlan .Deployment deployment : assignmentPlan .models ()) {
311
313
TrainedModelAssignment existingAssignment = currentMetadata .getDeploymentAssignment (deployment .id ());
@@ -343,7 +345,7 @@ private TrainedModelAssignmentMetadata.Builder buildAssignmentsFromPlan(Assignme
343
345
}
344
346
assignmentBuilder .calculateAndSetAssignmentState ();
345
347
346
- explainAssignments (assignmentPlan , nodeLoads , deployment , settings ).ifPresent (assignmentBuilder ::setReason );
348
+ explainAssignments (assignmentPlan , nodeLoads , deployment ).ifPresent (assignmentBuilder ::setReason );
347
349
builder .addNewAssignment (deployment .id (), assignmentBuilder );
348
350
}
349
351
return builder ;
@@ -352,8 +354,7 @@ private TrainedModelAssignmentMetadata.Builder buildAssignmentsFromPlan(Assignme
352
354
private Optional <String > explainAssignments (
353
355
AssignmentPlan assignmentPlan ,
354
356
Map <DiscoveryNode , NodeLoad > nodeLoads ,
355
- AssignmentPlan .Deployment deployment ,
356
- Settings settings
357
+ AssignmentPlan .Deployment deployment
357
358
) {
358
359
if (assignmentPlan .satisfiesAllocations (deployment )) {
359
360
return Optional .empty ();
@@ -365,7 +366,7 @@ private Optional<String> explainAssignments(
365
366
366
367
Map <String , String > nodeToReason = new TreeMap <>();
367
368
for (Map .Entry <DiscoveryNode , NodeLoad > nodeAndLoad : nodeLoads .entrySet ()) {
368
- Optional <String > reason = explainAssignment (assignmentPlan , nodeAndLoad .getKey (), nodeAndLoad .getValue (), deployment , settings );
369
+ Optional <String > reason = explainAssignment (assignmentPlan , nodeAndLoad .getKey (), nodeAndLoad .getValue (), deployment );
369
370
reason .ifPresent (s -> nodeToReason .put (nodeAndLoad .getKey ().getId (), s ));
370
371
}
371
372
@@ -384,8 +385,7 @@ private Optional<String> explainAssignment(
384
385
AssignmentPlan assignmentPlan ,
385
386
DiscoveryNode node ,
386
387
NodeLoad load ,
387
- AssignmentPlan .Deployment deployment ,
388
- Settings settings
388
+ AssignmentPlan .Deployment deployment
389
389
) {
390
390
if (Strings .isNullOrEmpty (load .getError ()) == false ) {
391
391
return Optional .of (load .getError ());
@@ -398,7 +398,7 @@ private Optional<String> explainAssignment(
398
398
// But we should also check if we managed to assign a model during the rebalance for which
399
399
// we check if the node has used up any of its allocated processors.
400
400
boolean isPerNodeOverheadAccountedFor = load .getNumAssignedJobsAndModels () > 0
401
- || assignmentPlan .getRemainingNodeCores (load .getNodeId ()) < MlProcessors .get (node , settings ).roundUp ();
401
+ || assignmentPlan .getRemainingNodeCores (load .getNodeId ()) < MlProcessors .get (node , allocatedProcessorsScale ).roundUp ();
402
402
long requiredMemory = deployment .memoryBytes () + (isPerNodeOverheadAccountedFor
403
403
? 0
404
404
: MachineLearning .NATIVE_EXECUTABLE_CODE_OVERHEAD .getBytes ());
@@ -427,7 +427,7 @@ private Optional<String> explainAssignment(
427
427
"This node has insufficient allocated processors. Available processors [{}], free processors [{}], "
428
428
+ "processors required for each allocation of this model [{}]" ,
429
429
new Object [] {
430
- MlProcessors .get (node , settings ).roundUp (),
430
+ MlProcessors .get (node , allocatedProcessorsScale ).roundUp (),
431
431
assignmentPlan .getRemainingNodeCores (node .getId ()),
432
432
deployment .threadsPerAllocation () }
433
433
)
0 commit comments