@@ -71,21 +71,31 @@ public JobVertexScaler(AutoScalerEventHandler<KEY, Context> autoScalerEventHandl
         this.autoScalerEventHandler = autoScalerEventHandler;
     }

-    public int computeScaleTargetParallelism(
+    public VertexScalingResult computeScaleTargetParallelism(
             Context context,
             JobVertexID vertex,
             Collection<ShipStrategy> inputShipStrategies,
             Map<ScalingMetric, EvaluatedScalingMetric> evaluatedMetrics,
             SortedMap<Instant, ScalingSummary> history,
-            Duration restartTime) {
+            Duration restartTime,
+            double backpropagationScaleFactor) {
         var conf = context.getConfiguration();
+
+        boolean excluded =
+                conf.get(AutoScalerOptions.VERTEX_EXCLUDE_IDS).contains(vertex.toHexString());
+        if (excluded) {
+            LOG.debug(
88+ "Vertex {} is part of `vertex.exclude.ids` config, Check for bottleneck but not scale" ,
+                    vertex);
+        }
+
         var currentParallelism = (int) evaluatedMetrics.get(PARALLELISM).getCurrent();
         double averageTrueProcessingRate = evaluatedMetrics.get(TRUE_PROCESSING_RATE).getAverage();
         if (Double.isNaN(averageTrueProcessingRate)) {
             LOG.warn(
                     "True processing rate is not available for {}, cannot compute new parallelism",
                     vertex);
-            return currentParallelism;
+            return VertexScalingResult.normalScaling(currentParallelism);
         }

         double targetCapacity =
@@ -95,9 +105,11 @@ public int computeScaleTargetParallelism(
             LOG.warn(
                     "Target data rate is not available for {}, cannot compute new parallelism",
                     vertex);
-            return currentParallelism;
+            return VertexScalingResult.normalScaling(currentParallelism);
         }

+        targetCapacity *= backpropagationScaleFactor;
+
         LOG.debug("Target processing capacity for {} is {}", vertex, targetCapacity);
         double scaleFactor = targetCapacity / averageTrueProcessingRate;
         double minScaleFactor = 1 - conf.get(MAX_SCALE_DOWN_FACTOR);
@@ -122,32 +134,44 @@ public int computeScaleTargetParallelism(
         double cappedTargetCapacity = averageTrueProcessingRate * scaleFactor;
         LOG.debug("Capped target processing capacity for {} is {}", vertex, cappedTargetCapacity);

-        int newParallelism =
+        int parallelismLowerLimit =
+                excluded
+                        ? currentParallelism
+                        : Math.min(currentParallelism, conf.getInteger(VERTEX_MIN_PARALLELISM));
+        int parallelismUpperLimit =
+                excluded
+                        ? currentParallelism
+                        : Math.max(currentParallelism, conf.getInteger(VERTEX_MAX_PARALLELISM));
+
+        var scalingResult =
                 scale(
                         currentParallelism,
                         inputShipStrategies,
                         (int) evaluatedMetrics.get(MAX_PARALLELISM).getCurrent(),
                         scaleFactor,
-                        Math.min(currentParallelism, conf.getInteger(VERTEX_MIN_PARALLELISM)),
-                        Math.max(currentParallelism, conf.getInteger(VERTEX_MAX_PARALLELISM)));
+                        parallelismLowerLimit,
+                        parallelismUpperLimit);

-        if (newParallelism == currentParallelism
+        if (scalingResult.getParallelism() == currentParallelism
                 || blockScalingBasedOnPastActions(
                         context,
                         vertex,
                         conf,
                         evaluatedMetrics,
                         history,
                         currentParallelism,
-                        newParallelism)) {
-            return currentParallelism;
+                        scalingResult.getParallelism())) {
+            return new VertexScalingResult(
+                    currentParallelism,
+                    scalingResult.getBottleneckScaleFactor(),
+                    scalingResult.isBottleneck());
         }

         // We record our expectations for this scaling operation
         evaluatedMetrics.put(
                 ScalingMetric.EXPECTED_PROCESSING_RATE,
                 EvaluatedScalingMetric.of(cappedTargetCapacity));
-        return newParallelism;
+        return scalingResult;
     }

     private boolean blockScalingBasedOnPastActions(
@@ -249,9 +273,12 @@ private boolean detectIneffectiveScaleUp(
      * <p>Also, in order to ensure the data is evenly spread across subtasks, we try to adjust the
      * parallelism for source and keyed vertex such that it divides the maxParallelism without a
      * remainder.
+     *
+     * <p>If newParallelism exceeds min(parallelismUpperLimit, maxParallelism), the job vertex is
+     * considered to be a bottleneck.
      */
     @VisibleForTesting
-    protected static int scale(
+    protected static VertexScalingResult scale(
             int currentParallelism,
             Collection<ShipStrategy> inputShipStrategies,
             int maxParallelism,
@@ -284,26 +311,36 @@ protected static int scale(
         // parallelism upper limit
         final int upperBound = Math.min(maxParallelism, parallelismUpperLimit);

+        boolean isBottleneck = false;
+        double bottleneckScaleFactor = 1.0;
+
+        // If the required parallelism is higher than the upper bound, the vertex is a bottleneck
+        if (newParallelism > upperBound) {
+            isBottleneck = true;
+            bottleneckScaleFactor = (double) upperBound / newParallelism;
+            newParallelism = upperBound;
+        }
+
         // Apply min/max parallelism
-        newParallelism = Math.min(Math.max(parallelismLowerLimit, newParallelism), upperBound);
+        newParallelism = Math.max(parallelismLowerLimit, newParallelism);

         var adjustByMaxParallelism =
                 inputShipStrategies.isEmpty() || inputShipStrategies.contains(HASH);
         if (!adjustByMaxParallelism) {
-            return newParallelism;
+            return new VertexScalingResult(newParallelism, bottleneckScaleFactor, isBottleneck);
         }

         // When the shuffle type of vertex inputs contains keyBy or vertex is a source, we try to
         // adjust the parallelism such that it divides the maxParallelism without a remainder
         // => data is evenly spread across subtasks
         for (int p = newParallelism; p <= maxParallelism / 2 && p <= upperBound; p++) {
             if (maxParallelism % p == 0) {
-                return p;
+                return new VertexScalingResult(p, bottleneckScaleFactor, isBottleneck);
             }
         }

         // If parallelism adjustment fails, use originally computed parallelism
-        return new VertexScalingResult(newParallelism, bottleneckScaleFactor, isBottleneck);
     }

     @VisibleForTesting
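
The VertexScalingResult type returned above is not defined in this hunk. A minimal sketch that is consistent with the calls visible in the diff (the three-argument constructor, normalScaling, getParallelism, getBottleneckScaleFactor, isBottleneck) could look like the following; the field names and the semantics of normalScaling (scale factor 1.0, not a bottleneck) are assumptions, not the PR's actual definition.

/** Hypothetical sketch of the result type used above; the actual class in the PR may differ. */
public final class VertexScalingResult {

    private final int parallelism;
    private final double bottleneckScaleFactor;
    private final boolean bottleneck;

    public VertexScalingResult(int parallelism, double bottleneckScaleFactor, boolean bottleneck) {
        this.parallelism = parallelism;
        this.bottleneckScaleFactor = bottleneckScaleFactor;
        this.bottleneck = bottleneck;
    }

    /** Assumed factory for a vertex that is scaled normally and is not a bottleneck. */
    public static VertexScalingResult normalScaling(int parallelism) {
        return new VertexScalingResult(parallelism, 1.0, false);
    }

    public int getParallelism() {
        return parallelism;
    }

    public double getBottleneckScaleFactor() {
        return bottleneckScaleFactor;
    }

    public boolean isBottleneck() {
        return bottleneck;
    }
}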
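
To make the new bottleneck handling in scale() concrete, here is a hypothetical walk-through with made-up numbers (they are illustrative only and do not come from the PR):

// Hypothetical numbers, for illustration only.
int maxParallelism = 128;        // max parallelism of the vertex
int parallelismUpperLimit = 32;  // from VERTEX_MAX_PARALLELISM, or currentParallelism if excluded
int newParallelism = 50;         // parallelism demanded by the computed scale factor

int upperBound = Math.min(maxParallelism, parallelismUpperLimit); // 32

// The vertex cannot reach the required parallelism, so it is a bottleneck.
boolean isBottleneck = newParallelism > upperBound;                  // true
double bottleneckScaleFactor = (double) upperBound / newParallelism; // 32 / 50 = 0.64
newParallelism = upperBound;                                         // capped at 32

// For keyed or source vertices the parallelism is then adjusted to divide maxParallelism evenly;
// 32 already divides 128, so it stays. The caller can feed bottleneckScaleFactor back into
// computeScaleTargetParallelism of upstream vertices as backpropagationScaleFactor, reducing
// their target capacity proportionally.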