Skip to content

Commit 8d6248d

Browse files
committed
Fix scale up for model allocations (#115189)
1 parent 9b62098 commit 8d6248d

File tree

2 files changed

+112
-1
lines changed

2 files changed

+112
-1
lines changed

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/autoscaling/MlAutoscalingContext.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ public boolean isEmpty() {
177177
return anomalyDetectionTasks.isEmpty()
178178
&& snapshotUpgradeTasks.isEmpty()
179179
&& dataframeAnalyticsTasks.isEmpty()
180-
&& modelAssignments.isEmpty();
180+
&& modelAssignments.values().stream().allMatch(assignment -> assignment.getTaskParams().getNumberOfAllocations() == 0);
181181
}
182182

183183
public List<String> findPartiallyAllocatedModels() {

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/autoscaling/MlAutoscalingDeciderServiceTests.java

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
import static org.elasticsearch.xpack.ml.utils.NativeMemoryCalculator.STATIC_JVM_UPPER_THRESHOLD;
4949
import static org.hamcrest.Matchers.containsString;
5050
import static org.hamcrest.Matchers.equalTo;
51+
import static org.hamcrest.Matchers.greaterThan;
5152
import static org.hamcrest.Matchers.is;
5253
import static org.hamcrest.Matchers.nullValue;
5354
import static org.mockito.ArgumentMatchers.any;
@@ -262,6 +263,116 @@ public void testScale_GivenUndeterminedMemory_ShouldReturnNullCapacity() {
262263
assertThat(result.requiredCapacity(), is(nullValue()));
263264
}
264265

266+
/**
 * Checks the decider's result when the only ML workload in the cluster state is a single
 * STARTED trained model assignment named "model-with-zero-allocations".
 *
 * The test calls {@code service.scale(...)} twice against the same cluster state and the same
 * 4GB current capacity: once immediately, and once after the test clock has been moved forward
 * by one hour. It expects the first result to report that the down scale delay has not been
 * satisfied, and the second to request a scale down to zero bytes of memory for both the
 * total and the node capacity.
 */
public void testScale_GivenModelWithZeroAllocations() {
267+
MlAutoscalingDeciderService service = buildService();
268+
service.onMaster();
269+
270+
// Cluster state: one trained model assignment plus a single node built via buildNode("ml-node", 4GB, 8).
ClusterState clusterState = new ClusterState.Builder(new ClusterName("cluster")).metadata(
271+
Metadata.builder()
272+
.putCustom(
273+
TrainedModelAssignmentMetadata.NAME,
274+
new TrainedModelAssignmentMetadata(
275+
Map.of(
276+
"model-with-zero-allocations",
277+
TrainedModelAssignment.Builder.empty(
278+
// NOTE(review): the TaskParams constructor is not visible here; going by the test name, the
// literal 0 below is presumably the number of allocations - confirm against the constructor.
new StartTrainedModelDeploymentAction.TaskParams(
279+
"model-with-zero-allocations",
280+
"model-with-zero-allocations-deployment",
281+
400,
282+
0,
283+
2,
284+
100,
285+
null,
286+
Priority.NORMAL,
287+
0L,
288+
0L
289+
),
290+
new AdaptiveAllocationsSettings(true, 0, 4)
291+
).setAssignmentState(AssignmentState.STARTED).build()
292+
)
293+
)
294+
)
295+
.build()
296+
).nodes(DiscoveryNodes.builder().add(buildNode("ml-node", ByteSizeValue.ofGb(4), 8)).build()).build();
297+
298+
// First decision: current capacity passed in is 4GB of memory for both AutoscalingResources arguments.
AutoscalingDeciderResult result = service.scale(
299+
Settings.EMPTY,
300+
new DeciderContext(
301+
clusterState,
302+
new AutoscalingCapacity(
303+
new AutoscalingCapacity.AutoscalingResources(null, ByteSizeValue.ofGb(4), null),
304+
new AutoscalingCapacity.AutoscalingResources(null, ByteSizeValue.ofGb(4), null)
305+
)
306+
)
307+
);
308+
// First call doesn't downscale as delay has not been satisfied
309+
assertThat(result.reason().summary(), containsString("down scale delay has not been satisfied"));
310+
311+
// Let's move time forward 1 hour
312+
timeSupplier.setOffset(TimeValue.timeValueHours(1));
313+
314+
// Second decision: identical inputs, only the test clock has advanced.
result = service.scale(
315+
Settings.EMPTY,
316+
new DeciderContext(
317+
clusterState,
318+
new AutoscalingCapacity(
319+
new AutoscalingCapacity.AutoscalingResources(null, ByteSizeValue.ofGb(4), null),
320+
new AutoscalingCapacity.AutoscalingResources(null, ByteSizeValue.ofGb(4), null)
321+
)
322+
)
323+
);
324+
// The decider is now expected to request a scale down to zero memory for both total and node.
assertThat(result.reason().summary(), equalTo("Requesting scale down as tier and/or node size could be smaller"));
325+
assertThat(result.requiredCapacity().total().memory().getBytes(), equalTo(0L));
326+
assertThat(result.requiredCapacity().node().memory().getBytes(), equalTo(0L));
327+
}
328+
329+
/**
 * Checks the decider's result when the cluster state holds a single STARTING trained model
 * assignment named "model" and no nodes are added to the cluster state.
 *
 * The test calls {@code service.scale(...)} once with a current capacity of
 * {@code AutoscalingResources.ZERO} for both arguments. It expects a "requesting scale up"
 * reason, required memory greater than {@code TEST_JOB_SIZE}, and a processor count of 2.0,
 * for both the total and the node capacity.
 */
public void testScale_GivenTrainedModelAllocationAndNoMlNode() {
330+
MlAutoscalingDeciderService service = buildService();
331+
service.onMaster();
332+
333+
// Cluster state: metadata only - unlike the zero-allocations test, no .nodes(...) is set on the builder.
ClusterState clusterState = new ClusterState.Builder(new ClusterName("cluster")).metadata(
334+
Metadata.builder()
335+
.putCustom(
336+
TrainedModelAssignmentMetadata.NAME,
337+
new TrainedModelAssignmentMetadata(
338+
Map.of(
339+
"model",
340+
TrainedModelAssignment.Builder.empty(
341+
// NOTE(review): the TaskParams constructor is not visible here; the literals 1 and 2 below are
// presumably allocations and threads per allocation (matching the 2.0 processors asserted
// at the end) - confirm against the constructor.
new StartTrainedModelDeploymentAction.TaskParams(
342+
"model",
343+
"model-deployment",
344+
400,
345+
1,
346+
2,
347+
100,
348+
null,
349+
Priority.NORMAL,
350+
0L,
351+
0L
352+
),
353+
new AdaptiveAllocationsSettings(true, 0, 4)
354+
).setAssignmentState(AssignmentState.STARTING).build()
355+
)
356+
)
357+
)
358+
.build()
359+
).build();
360+
361+
// Single decision, with a current capacity of ZERO resources for both arguments.
AutoscalingDeciderResult result = service.scale(
362+
Settings.EMPTY,
363+
new DeciderContext(
364+
clusterState,
365+
new AutoscalingCapacity(AutoscalingCapacity.AutoscalingResources.ZERO, AutoscalingCapacity.AutoscalingResources.ZERO)
366+
)
367+
);
368+
369+
// A scale up must be requested, with non-trivial memory and 2.0 processors at both total and node level.
assertThat(result.reason().summary(), containsString("requesting scale up"));
370+
assertThat(result.requiredCapacity().total().memory().getBytes(), greaterThan(TEST_JOB_SIZE));
371+
assertThat(result.requiredCapacity().total().processors().count(), equalTo(2.0));
372+
assertThat(result.requiredCapacity().node().memory().getBytes(), greaterThan(TEST_JOB_SIZE));
373+
assertThat(result.requiredCapacity().node().processors().count(), equalTo(2.0));
374+
}
375+
265376
private DiscoveryNode buildNode(String id, ByteSizeValue machineMemory, int allocatedProcessors) {
266377
return DiscoveryNodeUtils.create(
267378
id,

0 commit comments

Comments
 (0)