elastic · jan-elastic · Oct 23, 2024 · Oct 23, 2024 · jan-elastic · Oct 23, 2024
diff --git a/...a/org/elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScaler.java b/...a/org/elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScaler.java
@@ -33,6 +33,7 @@ public class AdaptiveAllocationsScaler {
     private final String deploymentId;
     private final KalmanFilter1d requestRateEstimator;
     private final KalmanFilter1d inferenceTimeEstimator;
+    private final long scaleToZeroAfterNoRequestsSeconds;
     private double timeWithoutRequestsSeconds;
 
     private int numberOfAllocations;
@@ -44,10 +45,11 @@ public class AdaptiveAllocationsScaler {
     private Double lastMeasuredRequestRate;
     private Double lastMeasuredInferenceTime;
     private Long lastMeasuredQueueSize;
-    private long scaleToZeroAfterNoRequestsSeconds;
 
     AdaptiveAllocationsScaler(String deploymentId, int numberOfAllocations, long scaleToZeroAfterNoRequestsSeconds) {
         this.deploymentId = deploymentId;
+        this.scaleToZeroAfterNoRequestsSeconds = scaleToZeroAfterNoRequestsSeconds;
+
         // A smoothing factor of 100 roughly means the last 100 measurements have an effect
         // on the estimated values. The sampling time is 10 seconds, so approximately the
         // last 15 minutes are taken into account.
@@ -67,7 +69,6 @@ public class AdaptiveAllocationsScaler {
         lastMeasuredRequestRate = null;
         lastMeasuredInferenceTime = null;
         lastMeasuredQueueSize = null;
-        this.scaleToZeroAfterNoRequestsSeconds = scaleToZeroAfterNoRequestsSeconds;
     }
 
     void setMinMaxNumberOfAllocations(Integer minNumberOfAllocations, Integer maxNumberOfAllocations) {
@@ -117,6 +118,10 @@ void process(AdaptiveAllocationsScalerService.Stats stats, double timeIntervalSe
         dynamicsChanged = false;
     }
 
+    void resetTimeWithoutRequests() {
+        timeWithoutRequestsSeconds = 0;
+    }
+
     double getLoadLower() {
         double requestRateLower = Math.max(0.0, requestRateEstimator.lower());
         double inferenceTimeLower = Math.max(0.0, inferenceTimeEstimator.hasValue() ? inferenceTimeEstimator.lower() : 1.0);

diff --git a/...lasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScalerService.java b/...lasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScalerService.java
@@ -188,7 +188,10 @@ Collection<DoubleWithAttributes> observeDouble(Function<AdaptiveAllocationsScale
 
     /**
      * The time interval without any requests that has to pass, before scaling down
-     * to zero allocations (in case min_allocations = 0).
+     * to zero allocations (in case min_allocations = 0). After this time interval
+     * without requests, the number of allocations is set to zero. When this time
+     * interval hasn't passed, the minimum number of allocations will always be
+     * larger than zero.
      */
     private static final long SCALE_TO_ZERO_AFTER_NO_REQUESTS_TIME_SECONDS = TimeValue.timeValueMinutes(15).getSeconds();
 
@@ -447,6 +450,12 @@ public boolean maybeStartAllocation(TrainedModelAssignment assignment) {
                 deploymentIdsWithInFlightScaleFromZeroRequests.add(assignment.getDeploymentId());
                 updateNumberOfAllocations(assignment.getDeploymentId(), 1, cleanUpListener);
             }
+
+            AdaptiveAllocationsScaler scaler = scalers.get(assignment.getDeploymentId());
+            if (scaler != null) {
+                scaler.resetTimeWithoutRequests();
+            }
+
             return true;
         }
         return false;

diff --git a/.../elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScalerTests.java b/.../elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScalerTests.java
@@ -195,6 +195,35 @@ public void testAutoscaling_scaleDownToZeroAllocations() {
         }
     }
 
+    public void testAutoscaling_resetTimeWithoutRequests() {
+        int scaleDownAfterInactivitySeconds = 60 * 15; // scale down to 0 after 15 minutes
+        AdaptiveAllocationsScaler adaptiveAllocationsScaler = new AdaptiveAllocationsScaler(
+            "test-deployment",
+            0,
+            scaleDownAfterInactivitySeconds
+        );
+
+        // 1 hour without requests, but call "reset" every 10 minutes, so don't scale.
+        for (int i = 0; i < 360; i++) {
+            adaptiveAllocationsScaler.process(new AdaptiveAllocationsScalerService.Stats(0, 0, 0, 0.05), 10, 0);
+            assertThat(adaptiveAllocationsScaler.scale(), nullValue());
+            if (i % 60 == 0) {
+                adaptiveAllocationsScaler.resetTimeWithoutRequests();
+            }
+        }
+
+        adaptiveAllocationsScaler.resetTimeWithoutRequests();
+        // 15 minutes with no requests, so don't scale.
+        for (int i = 0; i < 90; i++) {
+            adaptiveAllocationsScaler.process(new AdaptiveAllocationsScalerService.Stats(0, 0, 0, 0.05), 10, 1);
+            assertThat(adaptiveAllocationsScaler.scale(), nullValue());
+        }
+
+        // another second with no requests, so scale to zero allocations.
+        adaptiveAllocationsScaler.process(new AdaptiveAllocationsScalerService.Stats(0, 0, 0, 0.05), 1, 1);
+        assertThat(adaptiveAllocationsScaler.scale(), equalTo(0));
+    }
+
     public void testAutoscaling_dontScaleDownToZeroAllocationsWhenMinAllocationsIsSet() {
         AdaptiveAllocationsScaler adaptiveAllocationsScaler = new AdaptiveAllocationsScaler("test-deployment", 1, 60);
         adaptiveAllocationsScaler.setMinMaxNumberOfAllocations(1, null);