 package org.apache.flink.autoscaler;
 
 import org.apache.flink.annotation.VisibleForTesting;
+import org.apache.flink.api.java.tuple.Tuple2;
 import org.apache.flink.autoscaler.config.AutoScalerOptions;
 import org.apache.flink.autoscaler.event.AutoScalerEventHandler;
 import org.apache.flink.autoscaler.metrics.EvaluatedScalingMetric;
 
 import java.util.Collection;
 import java.util.Map;
 import java.util.Objects;
+import java.util.Optional;
 import java.util.SortedMap;
 
 import static org.apache.flink.autoscaler.config.AutoScalerOptions.MAX_SCALE_DOWN_FACTOR;
 import static org.apache.flink.autoscaler.config.AutoScalerOptions.VERTEX_MIN_PARALLELISM;
 import static org.apache.flink.autoscaler.metrics.ScalingMetric.EXPECTED_PROCESSING_RATE;
 import static org.apache.flink.autoscaler.metrics.ScalingMetric.MAX_PARALLELISM;
+import static org.apache.flink.autoscaler.metrics.ScalingMetric.NUM_PARTITIONS;
 import static org.apache.flink.autoscaler.metrics.ScalingMetric.PARALLELISM;
 import static org.apache.flink.autoscaler.metrics.ScalingMetric.TRUE_PROCESSING_RATE;
 import static org.apache.flink.autoscaler.topology.ShipStrategy.HASH;
@@ -66,6 +69,12 @@ public class JobVertexScaler<KEY, Context extends JobAutoScalerContext<KEY>> {
     protected static final String INEFFECTIVE_MESSAGE_FORMAT =
             "Ineffective scaling detected for %s (expected increase: %s, actual increase %s). Blocking of ineffective scaling decisions is %s";
 
+    @VisibleForTesting protected static final String SCALE_LIMITED = "ScalingLimited";
+
+    @VisibleForTesting
+    protected static final String SCALE_LIMITED_MESSAGE_FORMAT =
76+ "Scaling limited detected for %s (expected parallelism: %s, actual parallelism %s). Scaling limited due to %s" ;
+
     private Clock clock = Clock.system(ZoneId.systemDefault());
 
     private final AutoScalerEventHandler<KEY, Context> autoScalerEventHandler;
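Note: the new SCALE_LIMITED_MESSAGE_FORMAT is filled in the same way as the existing INEFFECTIVE_MESSAGE_FORMAT. A minimal sketch of how an event text would be produced (the vertex id and all numbers below are made up for illustration, not taken from this PR or its tests):

    // Illustration only: hypothetical values.
    String eventText =
            String.format(
                    SCALE_LIMITED_MESSAGE_FORMAT,
                    new JobVertexID(),   // the vertex whose scaling was limited
                    16,                  // parallelism the scaler wanted
                    12,                  // parallelism it settled on
                    "numPartitions: 12, upperBound (maxParallelism or parallelismUpperLimit): 24");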
@@ -191,16 +200,29 @@ public ParallelismChange computeScaleTargetParallelism(
         double cappedTargetCapacity = averageTrueProcessingRate * scaleFactor;
         LOG.debug("Capped target processing capacity for {} is {}", vertex, cappedTargetCapacity);
 
-        int newParallelism =
+        Tuple2<Integer, Optional<String>> newParallelism =
                 scale(
+                        vertex,
                         currentParallelism,
                         inputShipStrategies,
+                        (int) evaluatedMetrics.get(NUM_PARTITIONS).getCurrent(),
                         (int) evaluatedMetrics.get(MAX_PARALLELISM).getCurrent(),
                         scaleFactor,
                         Math.min(currentParallelism, conf.getInteger(VERTEX_MIN_PARALLELISM)),
                         Math.max(currentParallelism, conf.getInteger(VERTEX_MAX_PARALLELISM)));
 
-        if (newParallelism == currentParallelism) {
+        newParallelism.f1.ifPresent(
+                message -> {
+                    autoScalerEventHandler.handleEvent(
+                            context,
+                            AutoScalerEventHandler.Type.Warning,
+                            SCALE_LIMITED,
+                            message,
+                            SCALE_LIMITED + vertex + cappedTargetCapacity,
+                            conf.get(SCALING_EVENT_INTERVAL));
+                });
+
+        if (newParallelism.f0 == currentParallelism) {
             // Clear delayed scale down request if the new parallelism is equal to
             // currentParallelism.
             delayedScaleDown.clearVertex(vertex);
@@ -219,7 +241,7 @@ public ParallelismChange computeScaleTargetParallelism(
                 evaluatedMetrics,
                 history,
                 currentParallelism,
-                newParallelism,
+                newParallelism.f0,
                 delayedScaleDown);
     }
 
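The helper now reports a pair instead of a bare int: f0 carries the decided parallelism and f1 optionally explains why the request was limited. A minimal sketch of that calling convention with made-up values (LOG stands in for the class logger; the real caller is shown in the hunk above):

    // Sketch only: illustrative values, not production code.
    Tuple2<Integer, Optional<String>> result =
            Tuple2.of(12, Optional.of("numPartitions: 12, upperBound (maxParallelism or parallelismUpperLimit): 24"));
    int decidedParallelism = result.f0;   // used wherever the old int was used
    result.f1.ifPresent(reason -> LOG.warn("Scaling limited: {}", reason));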
@@ -345,11 +367,16 @@ private boolean detectIneffectiveScaleUp(
      * <p>Also, in order to ensure the data is evenly spread across subtasks, we try to adjust the
      * parallelism for source and keyed vertex such that it divides the maxParallelism without a
      * remainder.
+     *
+     * <p>This method also attempts to adjust the parallelism to ensure it aligns well with the
+     * number of partitions if a vertex has a known partition count.
      */
     @VisibleForTesting
-    protected static int scale(
+    protected static Tuple2<Integer, Optional<String>> scale(
+            JobVertexID vertex,
             int currentParallelism,
             Collection<ShipStrategy> inputShipStrategies,
+            int numPartitions,
             int maxParallelism,
             double scaleFactor,
             int parallelismLowerLimit,
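Since scale() stays @VisibleForTesting, tests can exercise the widened signature directly. A hypothetical invocation with made-up values, just to show the new parameter order (vertex and numPartitions are the additions; assumes java.util.List is imported):

    // Hypothetical test-style call; values are illustrative only.
    Tuple2<Integer, Optional<String>> result =
            JobVertexScaler.scale(
                    new JobVertexID(),   // vertex
                    2,                   // currentParallelism
                    List.of(HASH),       // inputShipStrategies
                    8,                   // numPartitions (new)
                    128,                 // maxParallelism
                    2.0,                 // scaleFactor
                    1,                   // parallelismLowerLimit
                    24);                 // parallelismUpperLimit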
@@ -378,28 +405,70 @@ protected static int scale(
 
         // Cap parallelism at either maxParallelism(number of key groups or source partitions) or
         // parallelism upper limit
-        final int upperBound = Math.min(maxParallelism, parallelismUpperLimit);
+        int upperBound = Math.min(maxParallelism, parallelismUpperLimit);
 
         // Apply min/max parallelism
         newParallelism = Math.min(Math.max(parallelismLowerLimit, newParallelism), upperBound);
 
         var adjustByMaxParallelism =
                 inputShipStrategies.isEmpty() || inputShipStrategies.contains(HASH);
         if (!adjustByMaxParallelism) {
-            return newParallelism;
+            return Tuple2.of(newParallelism, Optional.empty());
         }
 
-        // When the shuffle type of vertex inputs contains keyBy or vertex is a source, we try to
-        // adjust the parallelism such that it divides the maxParallelism without a remainder
-        // => data is evenly spread across subtasks
-        for (int p = newParallelism; p <= maxParallelism / 2 && p <= upperBound; p++) {
-            if (maxParallelism % p == 0) {
-                return p;
+        if (numPartitions <= 0) {
+            // When the shuffle type of vertex inputs contains keyBy or vertex is a source,
+            // we try to adjust the parallelism such that it divides the maxParallelism without a
+            // remainder => data is evenly spread across subtasks
+            for (int p = newParallelism; p <= maxParallelism / 2 && p <= upperBound; p++) {
+                if (maxParallelism % p == 0) {
+                    return Tuple2.of(p, Optional.empty());
+                }
+            }
+            // If parallelism adjustment fails, use originally computed parallelism
+            return Tuple2.of(newParallelism, Optional.empty());
+        } else {
+
+            // When we know the numPartitions of a vertex, adjust the parallelism such that it
+            // divides the numPartitions without a remainder => data is evenly distributed
+            // among subtasks
+            for (int p = newParallelism; p <= upperBound && p <= numPartitions; p++) {
+                if (numPartitions % p == 0) {
+                    return Tuple2.of(p, Optional.empty());
+                }
             }
-        }
 
-        // If parallelism adjustment fails, use originally computed parallelism
-        return newParallelism;
+            // When the rounded-up parallelism cannot evenly divide the source partition
+            // count, try to find the smallest parallelism that can still satisfy the current
+            // consumption rate.
+            for (int p = newParallelism; p > parallelismLowerLimit; p--) {
+                if (numPartitions / p > numPartitions / newParallelism) {
+                    if (numPartitions % p != 0) {
+                        p += 1;
+                    }
+                    var message =
+                            String.format(
+                                    SCALE_LIMITED_MESSAGE_FORMAT,
+                                    vertex,
+                                    newParallelism,
+                                    p,
+                                    String.format(
+                                            "numPartitions: %s, upperBound (maxParallelism or "
+                                                    + "parallelismUpperLimit): %s",
+                                            numPartitions, upperBound));
+                    return Tuple2.of(p, Optional.of(message));
+                }
+            }
+            // If no suitable parallelism can be found, fall back to parallelismLowerLimit
+            var message =
+                    String.format(
+                            SCALE_LIMITED_MESSAGE_FORMAT,
+                            vertex,
+                            newParallelism,
+                            parallelismLowerLimit,
+                            String.format("parallelismLowerLimit: %s", parallelismLowerLimit));
+            return Tuple2.of(parallelismLowerLimit, Optional.of(message));
+        }
     }
 
     @VisibleForTesting
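To make the partition-alignment rule easier to follow, here is a standalone, simplified sketch (my own illustration, not code from this PR): it repeats only the two loops applied when numPartitions is known, plus two worked inputs. The production method additionally returns an explanatory message that becomes the ScalingLimited event.

// Simplified, self-contained sketch of the numPartitions branch added to scale().
// It omits metrics, events and ship strategies and keeps only the arithmetic.
public final class PartitionAlignmentSketch {

    /** Mirrors the adjustment performed when a vertex has a known partition count. */
    static int align(int requested, int numPartitions, int upperBound, int lowerLimit) {
        // 1) Prefer the next parallelism that divides numPartitions without a remainder.
        for (int p = requested; p <= upperBound && p <= numPartitions; p++) {
            if (numPartitions % p == 0) {
                return p;
            }
        }
        // 2) Otherwise scan downward to the point where the per-subtask partition count
        //    (integer division) grows, stepping back up by one if that value is not a divisor.
        for (int p = requested; p > lowerLimit; p--) {
            if (numPartitions / p > numPartitions / requested) {
                return numPartitions % p != 0 ? p + 1 : p;
            }
        }
        // 3) No suitable value found: fall back to the lower limit.
        return lowerLimit;
    }

    public static void main(String[] args) {
        // 8 partitions, 3 requested: 4 divides 8 evenly, so 4 is chosen.
        System.out.println(align(3, 8, 24, 1));
        // 7 partitions, 5 requested, upper bound 6: no divisor in [5, 6]; the fallback
        // settles on 4 (at most ceil(7/4) = 2 partitions per subtask, same as 5 would give).
        System.out.println(align(5, 7, 6, 1));
    }
}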