Commit 226091f
[7.16] [ML] fix autoscaling capacity consistency (#81181) (#81221)
* [ML] fix autoscaling capacity consistency (#81181)

  In certain scenarios it is possible to request a scale up and then, on a subsequent call with no memory changes, request a scale down. In practice this shows up as many back-to-back autoscaling actions. It stems from the following situation:

  - A scale up is requested because a waiting job just barely doesn't fit at the current scale and tips it over into the "next scaling tier".
  - When calculating whether a scale down is possible, the JVM size used in the comparison against the required native size is too small.
  - Scale down therefore assumes a smaller node could work, but the true JVM percentage on that node wouldn't actually allow it.

  This commit also adds two larger situational tests:

  - a specific edge case that failed before this change
  - a more randomized test that verifies that when a scale up is requested and all jobs are then assigned, we do NOT accidentally ask for a scale down to a lower tier

* Fixing backport

* Fixing backport
1 parent aae6b16 commit 226091f
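To make the flip-flop concrete, here is a minimal, self-contained Java sketch of the floor that this fix introduces (see the MlAutoscalingDeciderService hunk below). The 1 GB minimum automatic node size, the 200 MB OS overhead, and the 432013312-byte "true" JVM size of a 1 GB node are taken from this diff; the example job sizes and the simplified allowedBytesForMl formula (node size minus JVM size minus OS overhead) are illustrative assumptions, not the real NativeMemoryCalculator implementation.

// Sketch only: approximates the scale-down floor added in MlAutoscalingDeciderService.checkForScaleDown.
// Constants marked "from this diff" come from the commit; everything else is a hypothetical example.
public class ScaleDownFloorSketch {

    static final long MB = 1024L * 1024;
    static final long MINIMUM_AUTOMATIC_NODE_SIZE = 1024 * MB; // from this diff (NativeMemoryCalculator)
    static final long OS_OVERHEAD = 200 * MB;                  // from this diff (NativeMemoryCalculator)
    static final long TRUE_JVM_ON_1GB_NODE = 432013312L;       // from the AUTO_NODE_TIERS table in the tests

    // Simplified stand-in for NativeMemoryCalculator.allowedBytesForMl when the node's true JVM size is known.
    static long allowedBytesForMl(long nodeSize, long jvmSize) {
        return nodeSize - jvmSize - OS_OVERHEAD;
    }

    public static void main(String[] args) {
        long largestJob = 300 * MB;        // hypothetical largest assigned job
        long nativeCodeOverhead = 30 * MB; // hypothetical stand-in for NATIVE_EXECUTABLE_CODE_OVERHEAD
        long requiredNode = largestJob + nativeCodeOverhead;

        // Before the fix: a ~330 MB requirement fed into an auto-calculated JVM size yields a node
        // estimate well below what a real 1 GB node needs, so the decider could suggest scaling back
        // down right after it had asked to scale up past the 1 GB tier.
        System.out.println("raw node requirement: " + requiredNode / MB + " MB");

        // After the fix: with `auto` enabled, never report less native memory than what the minimum
        // automatic node actually provides once its true JVM size is accounted for.
        long floored = Math.max(requiredNode, allowedBytesForMl(MINIMUM_AUTOMATIC_NODE_SIZE, TRUE_JVM_ON_1GB_NODE));
        System.out.println("floored node requirement: " + floored / MB + " MB"); // 412 MB (432013312 bytes)
    }
}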

File tree

5 files changed: +234 -13 lines changed


x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/autoscaling/MlAutoscalingDeciderService.java

Lines changed: 13 additions & 0 deletions
@@ -916,6 +916,19 @@ Optional<AutoscalingDeciderResult> checkForScaleDown(
         long currentlyNecessaryTier = nodeLoads.stream().mapToLong(NodeLoad::getAssignedJobMemory).sum();
         // The required NATIVE node memory is the largest job and our static overhead.
         long currentlyNecessaryNode = largestJob == 0 ? 0 : largestJob + MachineLearning.NATIVE_EXECUTABLE_CODE_OVERHEAD.getBytes();
+        // If we are using `auto` && have at least one job, that means our native node size should be at least native capacity provided
+        // via our `MINIMUM_AUTOMATIC_NODE_SIZE`. Otherwise, if we have to auto-calculate the JVM size, it could be much smaller than
+        // what will truly be used.
+        if (currentlyNecessaryNode > 0 && useAuto) {
+            currentlyNecessaryNode = Math.max(
+                currentlyNecessaryNode,
+                NativeMemoryCalculator.allowedBytesForMl(
+                    NativeMemoryCalculator.MINIMUM_AUTOMATIC_NODE_SIZE,
+                    maxMachineMemoryPercent,
+                    useAuto
+                )
+            );
+        }
         // We consider a scale down if we are not fully utilizing the tier
         // Or our largest job could be on a smaller node (meaning the same size tier but smaller nodes are possible).
         if (currentlyNecessaryTier < currentCapacity.getTier() || currentlyNecessaryNode < currentCapacity.getNode()) {

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/NodeLoad.java

Lines changed: 4 additions & 0 deletions
@@ -193,6 +193,10 @@ public Builder setMaxMemory(long maxMemory) {
             return this;
         }

+        public long getMaxMemory() {
+            return maxMemory;
+        }
+
         public Builder setMaxJobs(int maxJobs) {
             this.maxJobs = maxJobs;
             return this;

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/utils/NativeMemoryCalculator.java

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@
 public final class NativeMemoryCalculator {

     private static final long STATIC_JVM_UPPER_THRESHOLD = ByteSizeValue.ofGb(2).getBytes();
-    static final long MINIMUM_AUTOMATIC_NODE_SIZE = ByteSizeValue.ofGb(1).getBytes();
+    public static final long MINIMUM_AUTOMATIC_NODE_SIZE = ByteSizeValue.ofGb(1).getBytes();
     private static final long OS_OVERHEAD = ByteSizeValue.ofMb(200L).getBytes();

     private NativeMemoryCalculator() {}

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/autoscaling/MlAutoscalingDeciderServiceTests.java

Lines changed: 214 additions & 0 deletions
@@ -7,6 +7,7 @@

 package org.elasticsearch.xpack.ml.autoscaling;

+import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.Version;
 import org.elasticsearch.cluster.ClusterInfo;
 import org.elasticsearch.cluster.ClusterName;
@@ -21,6 +22,7 @@
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.ByteSizeValue;
 import org.elasticsearch.common.util.set.Sets;
+import org.elasticsearch.core.Tuple;
 import org.elasticsearch.persistent.PersistentTasksCustomMetadata;
 import org.elasticsearch.snapshots.SnapshotShardSizeInfo;
 import org.elasticsearch.test.ESTestCase;
@@ -33,6 +35,7 @@
 import org.elasticsearch.xpack.core.ml.action.StartDatafeedAction;
 import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsState;
 import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsTaskState;
+import org.elasticsearch.xpack.core.ml.job.config.Job;
 import org.elasticsearch.xpack.core.ml.job.config.JobState;
 import org.elasticsearch.xpack.core.ml.job.config.JobTaskState;
 import org.elasticsearch.xpack.ml.MachineLearning;
@@ -44,6 +47,7 @@
 import org.junit.Before;

 import java.time.Duration;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.Date;
@@ -54,20 +58,51 @@
 import java.util.function.LongSupplier;
 import java.util.stream.Collectors;

+import static org.elasticsearch.xpack.ml.MachineLearning.MACHINE_MEMORY_NODE_ATTR;
+import static org.elasticsearch.xpack.ml.MachineLearning.MAX_JVM_SIZE_NODE_ATTR;
 import static org.elasticsearch.xpack.ml.job.JobNodeSelector.AWAITING_LAZY_ASSIGNMENT;
+import static org.hamcrest.Matchers.allOf;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.greaterThan;
 import static org.hamcrest.Matchers.greaterThanOrEqualTo;
 import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.lessThan;
 import static org.hamcrest.Matchers.lessThanOrEqualTo;
 import static org.mockito.ArgumentMatchers.any;
 import static org.mockito.ArgumentMatchers.anyBoolean;
 import static org.mockito.ArgumentMatchers.anyInt;
+import static org.mockito.ArgumentMatchers.eq;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;

 public class MlAutoscalingDeciderServiceTests extends ESTestCase {

+    private static final long[] NODE_TIERS = new long[] {
+        1073741824L,
+        2147483648L,
+        4294967296L,
+        8589934592L,
+        17179869184L,
+        34359738368L,
+        68719476736L,
+        16106127360L,
+        32212254720L,
+        64424509440L };
+
+    public static final List<Tuple<Long, Long>> AUTO_NODE_TIERS = org.elasticsearch.core.List.of(
+        Tuple.tuple(1073741824L, 432013312L), // 1GB and true JVM size
+        Tuple.tuple(2147483648L, 536870912L), // 2GB ...
+        Tuple.tuple(4294967296L, 1073741824L), // 4GB ...
+        Tuple.tuple(8589934592L, 2147483648L), // 8GB ...
+        Tuple.tuple(17179869184L, 2147483648L), // 16GB ...
+        Tuple.tuple(34359738368L, 2147483648L), // 32GB ...
+        Tuple.tuple(68719476736L, 2147483648L), // 64GB ...
+        Tuple.tuple(16106127360L, 2147483648L), // 15GB ...
+        Tuple.tuple(32212254720L, 2147483648L), // 30GB ...
+        Tuple.tuple(64424509440L, 2147483648L) // 60GB ...
+    );
+
     private static final long DEFAULT_NODE_SIZE = ByteSizeValue.ofGb(20).getBytes();
     private static final long DEFAULT_JVM_SIZE = ByteSizeValue.ofMb((long) (DEFAULT_NODE_SIZE * 0.25)).getBytes();
     private static final long DEFAULT_JOB_SIZE = ByteSizeValue.ofMb(200).getBytes();
@@ -104,6 +139,168 @@ public void setup() {
         when(clusterService.getClusterSettings()).thenReturn(cSettings);
     }

+    public void testScalingEdgeCase() {
+        // This scale up should push above 1gb, but under 2gb.
+        // The unassigned job barely doesn't fit within the current scale (by a handful of mb)
+        when(mlMemoryTracker.getAnomalyDetectorJobMemoryRequirement(any())).thenReturn(
+            ByteSizeValue.ofMb(128).getBytes() + Job.PROCESS_MEMORY_OVERHEAD.getBytes()
+        );
+        List<String> jobTasks = org.elasticsearch.core.List.of("waiting_job");
+        List<NodeLoad> nodesForScaleup = org.elasticsearch.core.List.of(
+            NodeLoad.builder("any")
+                .setMaxMemory(432013312)
+                .setUseMemory(true)
+                .incAssignedJobMemory(
+                    (long) (168.7 * 1024 + 0.5) + (long) (1.4 * 1024 * 1024 + 0.5) + ByteSizeValue.ofMb(256).getBytes()
+                        + Job.PROCESS_MEMORY_OVERHEAD.getBytes() * 3
+                )
+                .incNumAssignedJobs()
+                .incNumAssignedJobs()
+                .incNumAssignedJobs()
+                .build()
+        );
+        MlScalingReason.Builder reasonBuilder = new MlScalingReason.Builder().setPassedConfiguration(Settings.EMPTY)
+            .setCurrentMlCapacity(
+                AutoscalingCapacity.builder().node(null, AUTO_NODE_TIERS.get(0).v1()).total(null, AUTO_NODE_TIERS.get(0).v1()).build()
+            );
+        MlAutoscalingDeciderService service = buildService();
+        service.setUseAuto(true);
+        AutoscalingDeciderResult scaleUpResult = service.checkForScaleUp(
+            0,
+            0,
+            nodesForScaleup,
+            jobTasks,
+            org.elasticsearch.core.List.of(),
+            null,
+            new NativeMemoryCapacity(432013312, 432013312, 432013312L),
+            reasonBuilder
+        ).orElseThrow(() -> new ElasticsearchException("unexpected empty result for scale up"));
+
+        assertThat(
+            scaleUpResult.requiredCapacity().total().memory().getBytes(),
+            allOf(greaterThan(ByteSizeValue.ofGb(1).getBytes()), lessThan(ByteSizeValue.ofGb(2).getBytes()))
+        );
+
+        // Assume a scale up to 2gb nodes
+        // We should NOT scale down below or to 1gb given the same jobs with 2gb node
+        long bytesForML = autoBytesForMl(AUTO_NODE_TIERS.get(1).v1(), AUTO_NODE_TIERS.get(1).v2());
+        List<NodeLoad> nodeForScaleDown = org.elasticsearch.core.List.of(
+            NodeLoad.builder("any")
+                .setMaxMemory(bytesForML)
+                .setUseMemory(true)
+                .incAssignedJobMemory(
+                    (long) (168.7 * 1024 + 0.5) + (long) (1.4 * 1024 * 1024 + 0.5) + ByteSizeValue.ofMb(256).getBytes() + ByteSizeValue
+                        .ofMb(128)
+                        .getBytes() + Job.PROCESS_MEMORY_OVERHEAD.getBytes() * 4
+                )
+                .incNumAssignedJobs()
+                .incNumAssignedJobs()
+                .incNumAssignedJobs()
+                .incNumAssignedJobs()
+                .build()
+        );
+        reasonBuilder = new MlScalingReason.Builder().setPassedConfiguration(Settings.EMPTY)
+            .setCurrentMlCapacity(AutoscalingCapacity.builder().node(null, 2147483648L).total(null, 2147483648L).build());
+        AutoscalingDeciderResult result = service.checkForScaleDown(
+            nodeForScaleDown,
+            ByteSizeValue.ofMb(256).getBytes() + Job.PROCESS_MEMORY_OVERHEAD.getBytes(),
+            new NativeMemoryCapacity(bytesForML, bytesForML, 536870912L),
+            reasonBuilder
+        ).orElseThrow(() -> new ElasticsearchException("unexpected empty result for scale down"));
+        assertThat(
+            result.requiredCapacity().total().memory().getBytes(),
+            allOf(greaterThan(ByteSizeValue.ofGb(1).getBytes()), lessThan(ByteSizeValue.ofGb(2).getBytes()))
+        );
+    }
+
+    public void testScaleStability() {
+        for (int i = 0; i < 10; i++) {
+            for (int tier = 0; tier < AUTO_NODE_TIERS.size() - 1; tier++) {
+                Tuple<Long, Long> lowerTier = AUTO_NODE_TIERS.get(tier);
+                final long memoryForMl = autoBytesForMl(lowerTier.v1(), lowerTier.v2());
+                Tuple<Long, Long> upperTier = AUTO_NODE_TIERS.get(tier + 1);
+                // The jobs that currently exist, to use in the scaleUp call
+                NodeLoad.Builder forScaleUp = new NodeLoad.Builder("any").setMaxMemory(memoryForMl)
+                    .setMaxJobs(Integer.MAX_VALUE)
+                    .setUseMemory(true);
+                // The jobs + load that exists for all jobs (after scale up), used in scaleDown call
+                NodeLoad.Builder forScaleDown = new NodeLoad.Builder("any").setMaxMemory(autoBytesForMl(upperTier.v1(), upperTier.v2()))
+                    .setMaxJobs(Integer.MAX_VALUE)
+                    .setUseMemory(true);
+                long maxJob = 0;
+                // Fill with existing tier jobs
+                while (forScaleUp.getFreeMemory() > Job.PROCESS_MEMORY_OVERHEAD.getBytes()) {
+                    long jobSize = randomLongBetween(Job.PROCESS_MEMORY_OVERHEAD.getBytes(), forScaleUp.getFreeMemory());
+                    maxJob = Math.max(jobSize, maxJob);
+                    forScaleUp.incNumAssignedJobs().incAssignedJobMemory(jobSize);
+                    forScaleDown.incNumAssignedJobs().incAssignedJobMemory(jobSize);
+                }
+                // Create jobs for scale up
+                NodeLoad nodeLoadForScaleUp = forScaleUp.build();
+                List<String> waitingJobs = new ArrayList<>();
+                while (forScaleDown.getFreeMemory() > Job.PROCESS_MEMORY_OVERHEAD.getBytes()) {
+                    long jobSize = randomLongBetween(Job.PROCESS_MEMORY_OVERHEAD.getBytes(), forScaleDown.getFreeMemory());
+                    if (forScaleDown.getFreeMemory() - jobSize <= 0) {
+                        break;
+                    }
+                    maxJob = Math.max(jobSize, maxJob);
+                    forScaleDown.incNumAssignedJobs().incAssignedJobMemory(jobSize);
+                    String waitingJob = randomAlphaOfLength(10);
+                    when(mlMemoryTracker.getAnomalyDetectorJobMemoryRequirement(eq(waitingJob))).thenReturn(jobSize);
+                    waitingJobs.add(waitingJob);
+                }
+                MlAutoscalingDeciderService service = buildService();
+                service.setUseAuto(true);
+
+                AutoscalingDeciderResult scaleUpResult = service.checkForScaleUp(
+                    0,
+                    0,
+                    org.elasticsearch.core.List.of(nodeLoadForScaleUp),
+                    waitingJobs,
+                    org.elasticsearch.core.List.of(),
+                    null,
+                    new NativeMemoryCapacity(memoryForMl, memoryForMl, lowerTier.v2()),
+                    new MlScalingReason.Builder().setPassedConfiguration(Settings.EMPTY)
+                        .setCurrentMlCapacity(AutoscalingCapacity.builder().node(null, lowerTier.v1()).total(null, lowerTier.v1()).build())
+                ).orElseThrow(() -> new ElasticsearchException("unexpected empty result for scale down"));
+
+                assertThat(scaleUpResult.requiredCapacity().total().memory().getBytes(), greaterThan(lowerTier.v1()));
+                assertThat(scaleUpResult.requiredCapacity().node().memory().getBytes(), greaterThanOrEqualTo(lowerTier.v1()));
+                AutoscalingCapacity requiredScaleUp = scaleUpResult.requiredCapacity();
+                // Its possible that the next tier is above what we consider "upperTier"
+                // This is just fine for this test, as long as scale_down does not drop below this tier
+                int nextTier = Arrays.binarySearch(NODE_TIERS, requiredScaleUp.total().memory().getBytes());
+                if (nextTier < 0) {
+                    nextTier = -nextTier - 1;
+                }
+                // Its possible we requested a huge scale up, this is OK, we just don't have validation numbers that exist past a certain
+                // point.
+                if (nextTier >= NODE_TIERS.length) {
+                    return;
+                }
+                long size = NODE_TIERS[nextTier];
+                long scaledBytesForMl = autoBytesForMl(size, AUTO_NODE_TIERS.get(nextTier).v2());
+                // It could be that scale down doesn't occur, this is fine as we are "perfectly scaled"
+                Optional<AutoscalingDeciderResult> result = service.checkForScaleDown(
+                    org.elasticsearch.core.List.of(forScaleDown.build()),
+                    maxJob,
+                    new NativeMemoryCapacity(scaledBytesForMl, scaledBytesForMl, AUTO_NODE_TIERS.get(nextTier).v2()),
+                    new MlScalingReason.Builder().setPassedConfiguration(Settings.EMPTY)
+                        .setCurrentMlCapacity(AutoscalingCapacity.builder().node(null, size).total(null, size).build())
+                );
+                // If scale down is present, we don't want to drop below our current tier.
+                // If we do, that means that for the same jobs we scaled with, we calculated something incorrectly.
+                if (result.isPresent()) {
+                    int afterScaleDownTier = Arrays.binarySearch(NODE_TIERS, result.get().requiredCapacity().total().memory().getBytes());
+                    if (afterScaleDownTier < 0) {
+                        afterScaleDownTier = -afterScaleDownTier - 1;
+                    }
+                    assertThat(afterScaleDownTier, equalTo(nextTier));
+                }
+            }
+        }
+    }
+
     public void testScale_whenNotOnMaster() {
         MlAutoscalingDeciderService service = buildService();
         service.offMaster();
@@ -758,4 +955,21 @@ public SnapshotShardSizeInfo snapshotShardSizeInfo() {
         }
     }

+    private static long autoBytesForMl(Long nodeSize, Long jvmSize) {
+        return NativeMemoryCalculator.allowedBytesForMl(
+            new DiscoveryNode(
+                "node",
+                ESTestCase.buildNewFakeTransportAddress(),
+                MapBuilder.<String, String>newMapBuilder()
+                    .put(MAX_JVM_SIZE_NODE_ATTR, jvmSize.toString())
+                    .put(MACHINE_MEMORY_NODE_ATTR, nodeSize.toString())
+                    .map(),
+                DiscoveryNodeRole.BUILT_IN_ROLES,
+                Version.CURRENT
+            ),
+            30,
+            true
+        ).orElseThrow(() -> new ElasticsearchException("Unexpected null for calculating bytes for ML"));
+    }
+
 }

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/utils/NativeMemoryCalculatorTests.java

Lines changed: 2 additions & 12 deletions
@@ -31,6 +31,7 @@
 import static org.elasticsearch.xpack.ml.MachineLearning.MAX_JVM_SIZE_NODE_ATTR;
 import static org.elasticsearch.xpack.ml.MachineLearning.MAX_MACHINE_MEMORY_PERCENT;
 import static org.elasticsearch.xpack.ml.MachineLearning.USE_AUTO_MACHINE_MEMORY_PERCENT;
+import static org.elasticsearch.xpack.ml.autoscaling.MlAutoscalingDeciderServiceTests.AUTO_NODE_TIERS;
 import static org.elasticsearch.xpack.ml.utils.NativeMemoryCalculator.MINIMUM_AUTOMATIC_NODE_SIZE;
 import static org.elasticsearch.xpack.ml.utils.NativeMemoryCalculator.dynamicallyCalculateJvmSizeFromNodeSize;
 import static org.hamcrest.Matchers.equalTo;
@@ -57,18 +58,7 @@ public void testAllowedBytesForMLWhenAutoIsFalse() {
     }

     public void testConsistencyInAutoCalculation() {
-        for (Tuple<Long, Long> nodeAndJvmSize : Arrays.asList(
-            Tuple.tuple(1073741824L, 432013312L), // 1GB and true JVM size
-            Tuple.tuple(2147483648L, 536870912L), // 2GB ...
-            Tuple.tuple(4294967296L, 1073741824L), // 4GB ...
-            Tuple.tuple(8589934592L, 2147483648L), // 8GB ...
-            Tuple.tuple(17179869184L, 2147483648L), // 16GB ...
-            Tuple.tuple(34359738368L, 2147483648L), // 32GB ...
-            Tuple.tuple(68719476736L, 2147483648L), // 64GB ...
-            Tuple.tuple(16106127360L, 2147483648L), // 15GB ...
-            Tuple.tuple(32212254720L, 2147483648L), // 30GB ...
-            Tuple.tuple(64424509440L, 2147483648L) // 60GB ...
-        )) {
+        for (Tuple<Long, Long> nodeAndJvmSize : AUTO_NODE_TIERS) {
             final long trueJvmSize = nodeAndJvmSize.v2();
             final long trueNodeSize = nodeAndJvmSize.v1();
             List<Long> nodeSizes = Arrays.asList(

0 commit comments