Skip to content

Commit 6f01454

Browse files
committed
[FLINK-30571] Estimate scalability coefficient from past scaling history using linear regression
1 parent 0e65a5a commit 6f01454

File tree

4 files changed

+495
-0
lines changed

4 files changed

+495
-0
lines changed

flink-autoscaler/src/main/java/org/apache/flink/autoscaler/JobVertexScaler.java

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,18 +34,24 @@
3434
import org.slf4j.Logger;
3535
import org.slf4j.LoggerFactory;
3636

37+
import java.math.BigDecimal;
38+
import java.math.RoundingMode;
3739
import java.time.Clock;
3840
import java.time.Duration;
3941
import java.time.Instant;
4042
import java.time.ZoneId;
43+
import java.util.ArrayList;
4144
import java.util.Collection;
45+
import java.util.List;
4246
import java.util.Map;
4347
import java.util.Objects;
4448
import java.util.SortedMap;
4549

4650
import static org.apache.flink.autoscaler.JobVertexScaler.KeyGroupOrPartitionsAdjustMode.MAXIMIZE_UTILISATION;
4751
import static org.apache.flink.autoscaler.config.AutoScalerOptions.MAX_SCALE_DOWN_FACTOR;
4852
import static org.apache.flink.autoscaler.config.AutoScalerOptions.MAX_SCALE_UP_FACTOR;
53+
import static org.apache.flink.autoscaler.config.AutoScalerOptions.OBSERVED_SCALABILITY_ENABLED;
54+
import static org.apache.flink.autoscaler.config.AutoScalerOptions.OBSERVED_SCALABILITY_MIN_OBSERVATIONS;
4955
import static org.apache.flink.autoscaler.config.AutoScalerOptions.SCALE_DOWN_INTERVAL;
5056
import static org.apache.flink.autoscaler.config.AutoScalerOptions.SCALING_EVENT_INTERVAL;
5157
import static org.apache.flink.autoscaler.config.AutoScalerOptions.SCALING_KEY_GROUP_PARTITIONS_ADJUST_MODE;
@@ -178,6 +184,12 @@ public ParallelismChange computeScaleTargetParallelism(
178184

179185
LOG.debug("Target processing capacity for {} is {}", vertex, targetCapacity);
180186
double scaleFactor = targetCapacity / averageTrueProcessingRate;
187+
if (conf.get(OBSERVED_SCALABILITY_ENABLED)) {
188+
double scalingCoefficient =
189+
JobVertexScaler.calculateObservedScalingCoefficient(
190+
history, conf.get(OBSERVED_SCALABILITY_MIN_OBSERVATIONS));
191+
scaleFactor = scaleFactor / scalingCoefficient;
192+
}
181193
double minScaleFactor = 1 - conf.get(MAX_SCALE_DOWN_FACTOR);
182194
double maxScaleFactor = 1 + conf.get(MAX_SCALE_UP_FACTOR);
183195
if (scaleFactor < minScaleFactor) {
@@ -236,6 +248,97 @@ public ParallelismChange computeScaleTargetParallelism(
236248
delayedScaleDown);
237249
}
238250

251+
/**
252+
* Calculates the scaling coefficient based on historical scaling data.
253+
*
254+
* <p>The scaling coefficient is computed using a weighted least squares approach, where more
255+
* recent data points and those with higher parallelism are given higher weights. If there are
256+
* not enough observations, or if the computed coefficient is invalid, a default value of {@code
257+
* 1.0} is returned, assuming linear scaling.
258+
*
259+
* @param history A {@code SortedMap} of {@code Instant} timestamps to {@code ScalingSummary}
260+
* @param minObservations The minimum number of observations required to compute the scaling
261+
* coefficient. If the number of historical entries is less than this threshold, a default
262+
* coefficient of {@code 1.0} is returned.
263+
* @return The computed scaling coefficient.
264+
*/
265+
@VisibleForTesting
266+
protected static double calculateObservedScalingCoefficient(
267+
SortedMap<Instant, ScalingSummary> history, int minObservations) {
268+
/*
269+
* The scaling coefficient is computed using a **weighted least squares** approach
270+
* to fit a linear model:
271+
*
272+
* R_i = β * P_i * α
273+
*
274+
* where:
275+
* - R_i = observed processing rate
276+
* - P_i = parallelism
277+
* - β = baseline processing rate
278+
* - α = scaling coefficient to optimize
279+
*
280+
* The optimization minimizes the **weighted sum of squared errors**:
281+
*
282+
* Loss = ∑ w_i * (R_i - β * α * P_i)^2
283+
*
284+
* Differentiating w.r.t. α and solving for α:
285+
*
286+
* α = ∑ (w_i * P_i * R_i) / (∑ (w_i * P_i^2) * β)
287+
*
288+
* We keep the system conservative for higher returns scenario by clamping computed α within 1.0.
289+
*/
290+
291+
// not enough data to compute scaling coefficient. we assume linear scaling.
292+
if (history.isEmpty() || history.size() < minObservations) {
293+
return 1.0;
294+
}
295+
296+
var baselineProcessingRate = AutoScalerUtils.computeBaselineProcessingRate(history);
297+
298+
if (Double.isNaN(baselineProcessingRate)) {
299+
return 1.0;
300+
}
301+
302+
Instant latestTimestamp = history.lastKey();
303+
304+
List<Double> parallelismList = new ArrayList<>();
305+
List<Double> processingRateList = new ArrayList<>();
306+
List<Double> weightList = new ArrayList<>();
307+
308+
for (Map.Entry<Instant, ScalingSummary> entry : history.entrySet()) {
309+
Instant timestamp = entry.getKey();
310+
ScalingSummary summary = entry.getValue();
311+
double parallelism = summary.getCurrentParallelism();
312+
double processingRate = summary.getMetrics().get(TRUE_PROCESSING_RATE).getAverage();
313+
314+
if (Double.isNaN(processingRate)) {
315+
LOG.warn(
316+
"True processing rate is not available in scaling history. Cannot compute scaling coefficient.");
317+
return 1.0;
318+
}
319+
320+
// Compute weight based on recency & parallelism
321+
double timeDiff =
322+
Duration.between(timestamp, latestTimestamp).getSeconds()
323+
+ 1; // Avoid division by zero
324+
double weight = parallelism / timeDiff;
325+
326+
parallelismList.add(parallelism);
327+
processingRateList.add(processingRate);
328+
weightList.add(weight);
329+
}
330+
331+
var coefficient =
332+
AutoScalerUtils.optimizeLinearScalingCoefficient(
333+
parallelismList,
334+
processingRateList,
335+
weightList,
336+
baselineProcessingRate,
337+
1,
338+
0.01);
339+
return BigDecimal.valueOf(coefficient).setScale(2, RoundingMode.CEILING).doubleValue();
340+
}
341+
239342
private ParallelismChange detectBlockScaling(
240343
Context context,
241344
JobVertexID vertex,

flink-autoscaler/src/main/java/org/apache/flink/autoscaler/config/AutoScalerOptions.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,4 +382,20 @@ private static ConfigOptions.OptionBuilder autoScalerConfig(String key) {
382382
"scaling.key-group.partitions.adjust.mode"))
383383
.withDescription(
384384
"How to adjust the parallelism of Source vertex or upstream shuffle is keyBy");
385+
386+
/**
 * Boolean switch for the observed-scalability feature; defaults to {@code false}. The option is
 * keyed as {@code observed-scalability.enabled} via {@code autoScalerConfig}, with a fallback
 * key derived from the same suffix via {@code oldOperatorConfigKey}.
 */
public static final ConfigOption<Boolean> OBSERVED_SCALABILITY_ENABLED =
387+
autoScalerConfig("observed-scalability.enabled")
388+
.booleanType()
389+
.defaultValue(false)
390+
.withFallbackKeys(oldOperatorConfigKey("observed-scalability.enabled"))
391+
.withDescription(
392+
"Enables the use of an observed scalability coefficient when computing target parallelism. If enabled, the system will estimate the scalability coefficient based on historical scaling data instead of assuming perfect linear scaling. This helps account for real-world inefficiencies such as network overhead and coordination costs.");
393+
394+
/**
 * Minimum number of scaling-history observations needed before a scalability coefficient is
 * estimated; defaults to {@code 5}. The option is keyed as {@code
 * observed-scalability.min-observations} via {@code autoScalerConfig}, with a fallback key
 * derived from the same suffix via {@code oldOperatorConfigKey}.
 */
public static final ConfigOption<Integer> OBSERVED_SCALABILITY_MIN_OBSERVATIONS =
395+
autoScalerConfig("observed-scalability.min-observations")
396+
.intType()
397+
.defaultValue(5)
398+
.withFallbackKeys(oldOperatorConfigKey("observed-scalability.min-observations"))
399+
.withDescription(
400+
"Defines the minimum number of historical scaling observations required to estimate the scalability coefficient. If the number of available observations is below this threshold, the system falls back to assuming linear scaling.");
385401
}

flink-autoscaler/src/main/java/org/apache/flink/autoscaler/utils/AutoScalerUtils.java

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,22 +17,27 @@
1717

1818
package org.apache.flink.autoscaler.utils;
1919

20+
import org.apache.flink.autoscaler.ScalingSummary;
2021
import org.apache.flink.autoscaler.config.AutoScalerOptions;
2122
import org.apache.flink.autoscaler.metrics.EvaluatedScalingMetric;
2223
import org.apache.flink.autoscaler.metrics.ScalingMetric;
2324
import org.apache.flink.configuration.Configuration;
2425
import org.apache.flink.runtime.jobgraph.JobVertexID;
2526

2627
import java.time.Duration;
28+
import java.time.Instant;
2729
import java.util.ArrayList;
2830
import java.util.Collection;
2931
import java.util.HashSet;
3032
import java.util.List;
3133
import java.util.Map;
34+
import java.util.NavigableMap;
3235
import java.util.Set;
36+
import java.util.SortedMap;
3337

3438
import static org.apache.flink.autoscaler.metrics.ScalingMetric.CATCH_UP_DATA_RATE;
3539
import static org.apache.flink.autoscaler.metrics.ScalingMetric.TARGET_DATA_RATE;
40+
import static org.apache.flink.autoscaler.metrics.ScalingMetric.TRUE_PROCESSING_RATE;
3641

3742
/** AutoScaler utilities. */
3843
public class AutoScalerUtils {
@@ -94,4 +99,94 @@ public static boolean excludeVerticesFromScaling(
9499
conf.set(AutoScalerOptions.VERTEX_EXCLUDE_IDS, new ArrayList<>(excludedIds));
95100
return anyAdded;
96101
}
102+
103+
/**
104+
* Computes the optimized linear scaling coefficient (α) by minimizing the weighted least
105+
* squares error.
106+
*
107+
* <p>This method estimates the scaling coefficient in a linear scaling model by fitting
108+
* observed processing rates and parallelism levels while applying weights to account for
109+
* recency and significance.
110+
*
111+
* <p>The computed coefficient is clamped within the specified lower and upper bounds to ensure
112+
* stability and prevent extreme scaling adjustments.
113+
*
114+
* @param parallelismLevels List of parallelism levels.
115+
* @param processingRates List of observed processing rates.
116+
* @param weights List of weights for each observation.
117+
* @param baselineProcessingRate Baseline processing rate.
118+
* @param upperBound Maximum allowable value for the scaling coefficient.
119+
* @param lowerBound Minimum allowable value for the scaling coefficient.
120+
* @return The optimized scaling coefficient (α), constrained within {@code [lowerBound,
121+
* upperBound]}.
122+
*/
123+
public static double optimizeLinearScalingCoefficient(
124+
List<Double> parallelismLevels,
125+
List<Double> processingRates,
126+
List<Double> weights,
127+
double baselineProcessingRate,
128+
double upperBound,
129+
double lowerBound) {
130+
131+
double weightedSum = 0.0;
132+
double weightedSquaredSum = 0.0;
133+
134+
for (int i = 0; i < parallelismLevels.size(); i++) {
135+
double parallelism = parallelismLevels.get(i);
136+
double processingRate = processingRates.get(i);
137+
double weight = weights.get(i);
138+
139+
weightedSum += weight * parallelism * processingRate;
140+
weightedSquaredSum += weight * parallelism * parallelism;
141+
}
142+
143+
if (weightedSquaredSum == 0.0) {
144+
return 1.0; // Fallback to linear scaling if denominator is zero
145+
}
146+
147+
double alpha = weightedSum / (weightedSquaredSum * baselineProcessingRate);
148+
149+
return Math.max(lowerBound, Math.min(upperBound, alpha));
150+
}
151+
152+
/**
153+
* Computes the baseline processing rate from historical scaling data.
154+
*
155+
* <p>The baseline processing rate represents the **processing rate per unit of parallelism**.
156+
* It is determined using the smallest observed parallelism in the history.
157+
*
158+
* @param history A {@code SortedMap} where keys are timestamps ({@code Instant}), and values
159+
* are {@code ScalingSummary} objects.
160+
* @return The computed baseline processing rate (processing rate per unit of parallelism).
161+
*/
162+
public static double computeBaselineProcessingRate(SortedMap<Instant, ScalingSummary> history) {
163+
ScalingSummary latestSmallestParallelismSummary = null;
164+
165+
for (Map.Entry<Instant, ScalingSummary> entry :
166+
((NavigableMap<Instant, ScalingSummary>) history).descendingMap().entrySet()) {
167+
ScalingSummary summary = entry.getValue();
168+
double parallelism = summary.getCurrentParallelism();
169+
170+
if (parallelism == 1) {
171+
return summary.getMetrics().get(TRUE_PROCESSING_RATE).getAverage();
172+
}
173+
174+
if (latestSmallestParallelismSummary == null
175+
|| parallelism < latestSmallestParallelismSummary.getCurrentParallelism()) {
176+
latestSmallestParallelismSummary = entry.getValue();
177+
}
178+
}
179+
180+
if (latestSmallestParallelismSummary == null) {
181+
return Double.NaN;
182+
}
183+
184+
double parallelism = latestSmallestParallelismSummary.getCurrentParallelism();
185+
double processingRate =
186+
latestSmallestParallelismSummary
187+
.getMetrics()
188+
.get(TRUE_PROCESSING_RATE)
189+
.getAverage();
190+
return processingRate / parallelism;
191+
}
97192
}

0 commit comments

Comments
 (0)