Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ public class AdaptiveAllocationsScaler {
private final String deploymentId;
private final KalmanFilter1d requestRateEstimator;
private final KalmanFilter1d inferenceTimeEstimator;
private final long scaleToZeroAfterNoRequestsSeconds;
private double timeWithoutRequestsSeconds;

private int numberOfAllocations;
Expand All @@ -44,10 +45,11 @@ public class AdaptiveAllocationsScaler {
private Double lastMeasuredRequestRate;
private Double lastMeasuredInferenceTime;
private Long lastMeasuredQueueSize;
private long scaleToZeroAfterNoRequestsSeconds;

AdaptiveAllocationsScaler(String deploymentId, int numberOfAllocations, long scaleToZeroAfterNoRequestsSeconds) {
this.deploymentId = deploymentId;
this.scaleToZeroAfterNoRequestsSeconds = scaleToZeroAfterNoRequestsSeconds;

// A smoothing factor of 100 roughly means the last 100 measurements have an effect
// on the estimated values. The sampling time is 10 seconds, so approximately the
// last 15 minutes are taken into account.
Expand All @@ -67,7 +69,6 @@ public class AdaptiveAllocationsScaler {
lastMeasuredRequestRate = null;
lastMeasuredInferenceTime = null;
lastMeasuredQueueSize = null;
this.scaleToZeroAfterNoRequestsSeconds = scaleToZeroAfterNoRequestsSeconds;
}

void setMinMaxNumberOfAllocations(Integer minNumberOfAllocations, Integer maxNumberOfAllocations) {
Expand Down Expand Up @@ -117,6 +118,10 @@ void process(AdaptiveAllocationsScalerService.Stats stats, double timeIntervalSe
dynamicsChanged = false;
}

void resetTimeWithoutRequests() {
timeWithoutRequestsSeconds = 0;
}

double getLoadLower() {
double requestRateLower = Math.max(0.0, requestRateEstimator.lower());
double inferenceTimeLower = Math.max(0.0, inferenceTimeEstimator.hasValue() ? inferenceTimeEstimator.lower() : 1.0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,10 @@ Collection<DoubleWithAttributes> observeDouble(Function<AdaptiveAllocationsScale

/**
* The time interval without any requests that has to pass, before scaling down
* to zero allocations (in case min_allocations = 0).
* to zero allocations (in case min_allocations = 0). After this time interval
* without requests, the number of allocations is set to zero. When this time
* interval hasn't passed, the minimum number of allocations will always be
* larger than zero.
*/
private static final long SCALE_TO_ZERO_AFTER_NO_REQUESTS_TIME_SECONDS = TimeValue.timeValueMinutes(15).getSeconds();

Expand Down Expand Up @@ -447,6 +450,12 @@ public boolean maybeStartAllocation(TrainedModelAssignment assignment) {
deploymentIdsWithInFlightScaleFromZeroRequests.add(assignment.getDeploymentId());
updateNumberOfAllocations(assignment.getDeploymentId(), 1, cleanUpListener);
}

AdaptiveAllocationsScaler scaler = scalers.get(assignment.getDeploymentId());
if (scaler != null) {
scaler.resetTimeWithoutRequests();
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If starting an allocation takes too long (longer than the inference request timeout of 10secs), the inference request fails prematurely, and never reaches the allocation. This means that it isn't counted in the node stats, the "timeWithoutRequests" of the autoscaler keeps ticking, and the new allocation is stopped too soon.

}

return true;
}
return false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,35 @@ public void testAutoscaling_scaleDownToZeroAllocations() {
}
}

public void testAutoscaling_resetTimeWithoutRequests() {
int scaleDownAfterInactivitySeconds = 60 * 15; // scale down to 0 after 15 minutes
AdaptiveAllocationsScaler adaptiveAllocationsScaler = new AdaptiveAllocationsScaler(
"test-deployment",
0,
scaleDownAfterInactivitySeconds
);

// 1 hour without requests, but call "reset" every 10 minutes, so don't scale.
for (int i = 0; i < 360; i++) {
adaptiveAllocationsScaler.process(new AdaptiveAllocationsScalerService.Stats(0, 0, 0, 0.05), 10, 0);
assertThat(adaptiveAllocationsScaler.scale(), nullValue());
if (i % 60 == 0) {
adaptiveAllocationsScaler.resetTimeWithoutRequests();
}
}

adaptiveAllocationsScaler.resetTimeWithoutRequests();
// 15 minutes with no requests, so don't scale.
for (int i = 0; i < 90; i++) {
adaptiveAllocationsScaler.process(new AdaptiveAllocationsScalerService.Stats(0, 0, 0, 0.05), 10, 1);
assertThat(adaptiveAllocationsScaler.scale(), nullValue());
}

// another second with no requests, so scale to zero allocations.
adaptiveAllocationsScaler.process(new AdaptiveAllocationsScalerService.Stats(0, 0, 0, 0.05), 1, 1);
assertThat(adaptiveAllocationsScaler.scale(), equalTo(0));
}

public void testAutoscaling_dontScaleDownToZeroAllocationsWhenMinAllocationsIsSet() {
AdaptiveAllocationsScaler adaptiveAllocationsScaler = new AdaptiveAllocationsScaler("test-deployment", 1, 60);
adaptiveAllocationsScaler.setMinMaxNumberOfAllocations(1, null);
Expand Down
Loading