Skip to content

Commit 447c3be

Browse files
committed
Fix scale up for model allocations (#115189)
1 parent 9b62098 commit 447c3be

File tree

2 files changed

+118
-1
lines changed

2 files changed

+118
-1
lines changed

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/autoscaling/MlAutoscalingContext.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ public boolean isEmpty() {
177177
return anomalyDetectionTasks.isEmpty()
178178
&& snapshotUpgradeTasks.isEmpty()
179179
&& dataframeAnalyticsTasks.isEmpty()
180-
&& modelAssignments.isEmpty();
180+
&& modelAssignments.values().stream().allMatch(assignment -> assignment.getTaskParams().getNumberOfAllocations() == 0);
181181
}
182182

183183
public List<String> findPartiallyAllocatedModels() {

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/autoscaling/MlAutoscalingDeciderServiceTests.java

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,12 @@
2929
import org.elasticsearch.xpack.autoscaling.capacity.AutoscalingDeciderContext;
3030
import org.elasticsearch.xpack.autoscaling.capacity.AutoscalingDeciderResult;
3131
import org.elasticsearch.xpack.core.ml.MachineLearningField;
32+
import org.elasticsearch.xpack.core.ml.action.StartTrainedModelDeploymentAction;
33+
import org.elasticsearch.xpack.core.ml.inference.assignment.AdaptiveAllocationsSettings;
34+
import org.elasticsearch.xpack.core.ml.inference.assignment.AssignmentState;
35+
import org.elasticsearch.xpack.core.ml.inference.assignment.Priority;
36+
import org.elasticsearch.xpack.core.ml.inference.assignment.TrainedModelAssignment;
37+
import org.elasticsearch.xpack.core.ml.inference.assignment.TrainedModelAssignmentMetadata;
3238
import org.elasticsearch.xpack.core.ml.job.config.JobState;
3339
import org.elasticsearch.xpack.ml.MachineLearning;
3440
import org.elasticsearch.xpack.ml.job.NodeLoad;
@@ -48,6 +54,7 @@
4854
import static org.elasticsearch.xpack.ml.utils.NativeMemoryCalculator.STATIC_JVM_UPPER_THRESHOLD;
4955
import static org.hamcrest.Matchers.containsString;
5056
import static org.hamcrest.Matchers.equalTo;
57+
import static org.hamcrest.Matchers.greaterThan;
5158
import static org.hamcrest.Matchers.is;
5259
import static org.hamcrest.Matchers.nullValue;
5360
import static org.mockito.ArgumentMatchers.any;
@@ -262,6 +269,116 @@ public void testScale_GivenUndeterminedMemory_ShouldReturnNullCapacity() {
262269
assertThat(result.requiredCapacity(), is(nullValue()));
263270
}
264271

272+
public void testScale_GivenModelWithZeroAllocations() {
273+
MlAutoscalingDeciderService service = buildService();
274+
service.onMaster();
275+
276+
ClusterState clusterState = new ClusterState.Builder(new ClusterName("cluster")).metadata(
277+
Metadata.builder()
278+
.putCustom(
279+
TrainedModelAssignmentMetadata.NAME,
280+
new TrainedModelAssignmentMetadata(
281+
Map.of(
282+
"model-with-zero-allocations",
283+
TrainedModelAssignment.Builder.empty(
284+
new StartTrainedModelDeploymentAction.TaskParams(
285+
"model-with-zero-allocations",
286+
"model-with-zero-allocations-deployment",
287+
400,
288+
0,
289+
2,
290+
100,
291+
null,
292+
Priority.NORMAL,
293+
0L,
294+
0L
295+
),
296+
new AdaptiveAllocationsSettings(true, 0, 4)
297+
).setAssignmentState(AssignmentState.STARTED).build()
298+
)
299+
)
300+
)
301+
.build()
302+
).nodes(DiscoveryNodes.builder().add(buildNode("ml-node", ByteSizeValue.ofGb(4), 8)).build()).build();
303+
304+
AutoscalingDeciderResult result = service.scale(
305+
Settings.EMPTY,
306+
new DeciderContext(
307+
clusterState,
308+
new AutoscalingCapacity(
309+
new AutoscalingCapacity.AutoscalingResources(null, ByteSizeValue.ofGb(4), null),
310+
new AutoscalingCapacity.AutoscalingResources(null, ByteSizeValue.ofGb(4), null)
311+
)
312+
)
313+
);
314+
// First call doesn't downscale as delay has not been satisfied
315+
assertThat(result.reason().summary(), containsString("down scale delay has not been satisfied"));
316+
317+
// Let's move time forward 1 hour
318+
timeSupplier.setOffset(TimeValue.timeValueHours(1));
319+
320+
result = service.scale(
321+
Settings.EMPTY,
322+
new DeciderContext(
323+
clusterState,
324+
new AutoscalingCapacity(
325+
new AutoscalingCapacity.AutoscalingResources(null, ByteSizeValue.ofGb(4), null),
326+
new AutoscalingCapacity.AutoscalingResources(null, ByteSizeValue.ofGb(4), null)
327+
)
328+
)
329+
);
330+
assertThat(result.reason().summary(), equalTo("Requesting scale down as tier and/or node size could be smaller"));
331+
assertThat(result.requiredCapacity().total().memory().getBytes(), equalTo(0L));
332+
assertThat(result.requiredCapacity().node().memory().getBytes(), equalTo(0L));
333+
}
334+
335+
public void testScale_GivenTrainedModelAllocationAndNoMlNode() {
336+
MlAutoscalingDeciderService service = buildService();
337+
service.onMaster();
338+
339+
ClusterState clusterState = new ClusterState.Builder(new ClusterName("cluster")).metadata(
340+
Metadata.builder()
341+
.putCustom(
342+
TrainedModelAssignmentMetadata.NAME,
343+
new TrainedModelAssignmentMetadata(
344+
Map.of(
345+
"model",
346+
TrainedModelAssignment.Builder.empty(
347+
new StartTrainedModelDeploymentAction.TaskParams(
348+
"model",
349+
"model-deployment",
350+
400,
351+
1,
352+
2,
353+
100,
354+
null,
355+
Priority.NORMAL,
356+
0L,
357+
0L
358+
),
359+
new AdaptiveAllocationsSettings(true, 0, 4)
360+
).setAssignmentState(AssignmentState.STARTING).build()
361+
)
362+
)
363+
)
364+
.build()
365+
).build();
366+
367+
AutoscalingDeciderResult result = service.scale(
368+
Settings.EMPTY,
369+
new DeciderContext(
370+
clusterState,
371+
new AutoscalingCapacity(AutoscalingCapacity.AutoscalingResources.ZERO, AutoscalingCapacity.AutoscalingResources.ZERO)
372+
)
373+
);
374+
375+
assertThat(result.reason().summary(), containsString("requesting scale up"));
376+
assertThat(result.requiredCapacity().total().memory().getBytes(), greaterThan(TEST_JOB_SIZE));
377+
assertThat(result.requiredCapacity().total().processors().count(), equalTo(2.0));
378+
assertThat(result.requiredCapacity().node().memory().getBytes(), greaterThan(TEST_JOB_SIZE));
379+
assertThat(result.requiredCapacity().node().processors().count(), equalTo(2.0));
380+
}
381+
265382
private DiscoveryNode buildNode(String id, ByteSizeValue machineMemory, int allocatedProcessors) {
266383
return DiscoveryNodeUtils.create(
267384
id,

0 commit comments

Comments
 (0)