Skip to content

Commit 571d611

Browse files
[8.4][ML] Validate trained model deployment queue_capacity limit (#89611)
When starting a trained model deployment, a queue is created. If the queue_capacity is too large, it can lead to OOM and a node crash. This commit adds validation that the queue_capacity cannot be more than 1M. Closes #89555
1 parent 85d40b6 commit 571d611

File tree

4 files changed

+35
-3
lines changed

4 files changed

+35
-3
lines changed

docs/changelog/89611.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 89611
2+
summary: "[ML] Validate trained model deployment `queue_capacity` limit"
3+
area: Machine Learning
4+
type: bug
5+
issues:
6+
- 89555

docs/reference/ml/trained-models/apis/start-trained-model-deployment.asciidoc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ Defaults to 1.
7171
Controls how many inference requests are allowed in the queue at a time.
7272
Every machine learning node in the cluster where the model can be allocated
7373
has a queue of this size; when the number of requests exceeds the total value,
74-
new requests are rejected with a 429 error. Defaults to 1024.
74+
new requests are rejected with a 429 error. Defaults to 1024. The maximum allowed value is 1000000.
7575

7676
`threads_per_allocation`::
7777
(Optional, integer)

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/StartTrainedModelDeploymentAction.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,10 @@ public static class Request extends MasterNodeRequest<Request> implements ToXCon
7171
AllocationStatus.State.FULLY_ALLOCATED };
7272

7373
private static final int MAX_THREADS_PER_ALLOCATION = 32;
74+
/**
75+
* If the queue capacity is too large then we can OOM when we create the queue.
76+
*/
77+
private static final int MAX_QUEUE_CAPACITY = 1_000_000;
7478

7579
public static final ParseField MODEL_ID = new ParseField("model_id");
7680
public static final ParseField TIMEOUT = new ParseField("timeout");
@@ -248,6 +252,9 @@ public ActionRequestValidationException validate() {
248252
if (queueCapacity < 1) {
249253
validationException.addValidationError("[" + QUEUE_CAPACITY + "] must be a positive integer");
250254
}
255+
if (queueCapacity > MAX_QUEUE_CAPACITY) {
256+
validationException.addValidationError("[" + QUEUE_CAPACITY + "] must be less than " + MAX_QUEUE_CAPACITY);
257+
}
251258
return validationException.validationErrors().isEmpty() ? null : validationException;
252259
}
253260

x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/StartTrainedModelDeploymentRequestTests.java

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,13 +53,13 @@ public static Request createRandom() {
5353
request.setWaitForState(randomFrom(AllocationStatus.State.values()));
5454
}
5555
if (randomBoolean()) {
56-
request.setThreadsPerAllocation(randomIntBetween(1, 8));
56+
request.setThreadsPerAllocation(randomFrom(1, 2, 4, 8, 16, 32));
5757
}
5858
if (randomBoolean()) {
5959
request.setNumberOfAllocations(randomIntBetween(1, 8));
6060
}
6161
if (randomBoolean()) {
62-
request.setQueueCapacity(randomIntBetween(1, 10000));
62+
request.setQueueCapacity(randomIntBetween(1, 1000000));
6363
}
6464
return request;
6565
}
@@ -150,6 +150,25 @@ public void testValidate_GivenQueueCapacityIsNegative() {
150150
assertThat(e.getMessage(), containsString("[queue_capacity] must be a positive integer"));
151151
}
152152

153+
public void testValidate_GivenQueueCapacityIsAtLimit() {
154+
Request request = createRandom();
155+
request.setQueueCapacity(1_000_000);
156+
157+
ActionRequestValidationException e = request.validate();
158+
159+
assertThat(e, is(nullValue()));
160+
}
161+
162+
public void testValidate_GivenQueueCapacityIsOverLimit() {
163+
Request request = createRandom();
164+
request.setQueueCapacity(1_000_001);
165+
166+
ActionRequestValidationException e = request.validate();
167+
168+
assertThat(e, is(not(nullValue())));
169+
assertThat(e.getMessage(), containsString("[queue_capacity] must be less than 1000000"));
170+
}
171+
153172
public void testDefaults() {
154173
Request request = new Request(randomAlphaOfLength(10));
155174
assertThat(request.getTimeout(), equalTo(TimeValue.timeValueSeconds(20)));

0 commit comments

Comments
 (0)