[FLINK-36535][autoscaler] Optimize the scale down logic based on historical parallelism to reduce the rescale frequency #920
Changes from all commits
```diff
@@ -88,31 +88,22 @@ public JobVertexScaler(AutoScalerEventHandler<KEY, Context> autoScalerEventHandler
         this.autoScalerEventHandler = autoScalerEventHandler;
     }

-    /** The parallelism change type of {@link ParallelismChange}. */
-    public enum ParallelismChangeType {
-        NO_CHANGE,
-        REQUIRED_CHANGE,
-        OPTIONAL_CHANGE;
-    }
-
-    /**
-     * The rescaling will be triggered if any vertex's ParallelismChange is required. This means
-     * that if all vertices' ParallelismChange is optional, rescaling will be ignored.
-     */
+    /** The rescaling will be triggered if any vertex's {@link ParallelismChange} is changed. */
     @Getter
     public static class ParallelismChange {

-        private static final ParallelismChange NO_CHANGE =
-                new ParallelismChange(ParallelismChangeType.NO_CHANGE, -1);
+        private static final ParallelismChange NO_CHANGE = new ParallelismChange(-1);

-        private final ParallelismChangeType changeType;
         private final int newParallelism;

-        private ParallelismChange(ParallelismChangeType changeType, int newParallelism) {
-            this.changeType = changeType;
+        private ParallelismChange(int newParallelism) {
             this.newParallelism = newParallelism;
         }

         public boolean isNoChange() {
             return this == NO_CHANGE;
         }

         @Override
         public boolean equals(Object o) {
             if (this == o) {
```
```diff
@@ -122,30 +113,24 @@ public boolean equals(Object o) {
                 return false;
             }
             ParallelismChange that = (ParallelismChange) o;
-            return changeType == that.changeType && newParallelism == that.newParallelism;
+            return newParallelism == that.newParallelism;
         }

         @Override
         public int hashCode() {
-            return Objects.hash(changeType, newParallelism);
+            return Objects.hash(newParallelism);
         }

         @Override
         public String toString() {
-            return "ParallelismChange{"
-                    + "changeType="
-                    + changeType
-                    + ", newParallelism="
-                    + newParallelism
-                    + '}';
+            return isNoChange()
+                    ? "NoParallelismChange"
+                    : "ParallelismChange{newParallelism=" + newParallelism + '}';
         }

-        public static ParallelismChange required(int newParallelism) {
-            return new ParallelismChange(ParallelismChangeType.REQUIRED_CHANGE, newParallelism);
-        }
-
-        public static ParallelismChange optional(int newParallelism) {
-            return new ParallelismChange(ParallelismChangeType.OPTIONAL_CHANGE, newParallelism);
+        public static ParallelismChange build(int newParallelism) {
+            checkArgument(newParallelism > 0, "The parallelism should be greater than 0.");
+            return new ParallelismChange(newParallelism);
         }

         public static ParallelismChange noChange() {
```
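The effect of this simplification is easiest to see from the caller's side: before, a rescale fired only when at least one vertex reported a `REQUIRED_CHANGE`, and purely `OPTIONAL_CHANGE` results were ignored; after, any vertex whose result is not `noChange()` triggers a rescale. Below is a minimal, self-contained sketch of that gating logic; the `ParallelismChange` mirror and the `shouldRescale` helper are illustrative stand-ins, not the operator's actual API.

```java
import java.util.List;

// Standalone mirror of the simplified ParallelismChange (illustrative only).
final class ParallelismChange {
    private static final ParallelismChange NO_CHANGE = new ParallelismChange(-1);
    private final int newParallelism;

    private ParallelismChange(int newParallelism) {
        this.newParallelism = newParallelism;
    }

    static ParallelismChange build(int newParallelism) {
        if (newParallelism <= 0) {
            throw new IllegalArgumentException("The parallelism should be greater than 0.");
        }
        return new ParallelismChange(newParallelism);
    }

    static ParallelismChange noChange() {
        return NO_CHANGE;
    }

    boolean isNoChange() {
        return this == NO_CHANGE;
    }
}

public class RescaleGateSketch {
    // After this PR there is no REQUIRED/OPTIONAL distinction: one real change
    // on any vertex is enough to trigger a rescale.
    static boolean shouldRescale(List<ParallelismChange> perVertexChanges) {
        return perVertexChanges.stream().anyMatch(c -> !c.isNoChange());
    }

    public static void main(String[] args) {
        var changes = List.of(ParallelismChange.noChange(), ParallelismChange.build(4));
        System.out.println(shouldRescale(changes)); // prints "true"
    }
}
```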
```diff
@@ -263,7 +248,7 @@ private ParallelismChange detectBlockScaling(

         // If we don't have past scaling actions for this vertex, don't block scale up.
         if (history.isEmpty()) {
-            return ParallelismChange.required(newParallelism);
+            return ParallelismChange.build(newParallelism);
         }

         var lastSummary = history.get(history.lastKey());
```
```diff
@@ -275,7 +260,7 @@ && detectIneffectiveScaleUp(
                 return ParallelismChange.noChange();
             }

-            return ParallelismChange.required(newParallelism);
+            return ParallelismChange.build(newParallelism);
         } else {
             return applyScaleDownInterval(delayedScaleDown, vertex, conf, newParallelism);
         }
```
```diff
@@ -289,21 +274,26 @@ private ParallelismChange applyScaleDownInterval(
         var scaleDownInterval = conf.get(SCALE_DOWN_INTERVAL);
         if (scaleDownInterval.toMillis() <= 0) {
             // The scale down interval is disabled, so don't block scaling.
-            return ParallelismChange.required(newParallelism);
-        }
-
-        var firstTriggerTime = delayedScaleDown.getFirstTriggerTimeForVertex(vertex);
-        if (firstTriggerTime.isEmpty()) {
-            LOG.info("The scale down of {} is delayed by {}.", vertex, scaleDownInterval);
-            delayedScaleDown.updateTriggerTime(vertex, clock.instant());
-            return ParallelismChange.optional(newParallelism);
+            return ParallelismChange.build(newParallelism);
         }

-        if (clock.instant().isBefore(firstTriggerTime.get().plus(scaleDownInterval))) {
-            LOG.debug("Try to skip immediate scale down within scale-down interval for {}", vertex);
-            return ParallelismChange.optional(newParallelism);
+        var now = clock.instant();
+        var delayedScaleDownInfo = delayedScaleDown.triggerScaleDown(vertex, now, newParallelism);
+
+        // Never scale down within scale down interval
+        if (now.isBefore(delayedScaleDownInfo.getFirstTriggerTime().plus(scaleDownInterval))) {
```
Contributor: Could we rename this method to …

Member (Author): Is it a typo? Do you mean rename the …? I prefer to use …

Contributor: I thought that … I'm not sure this is correct. We want to delay scale down from the first time we scale up, not the first time we scaled down.

Member (Author): Thanks @mxm for the clarification! Actually, we expect to use the first scale-down time instead of the scale-up time as the first trigger time. As I understand it, the scale-down interval is meant to reduce the scale-down frequency. Please correct me if anything is wrong, thank you~
```diff
+            if (now.equals(delayedScaleDownInfo.getFirstTriggerTime())) {
+                LOG.info("The scale down of {} is delayed by {}.", vertex, scaleDownInterval);
+            } else {
+                LOG.debug(
+                        "Try to skip immediate scale down within scale-down interval for {}",
+                        vertex);
+            }
+            return ParallelismChange.noChange();
         } else {
-            return ParallelismChange.required(newParallelism);
+            // Using the maximum parallelism within the scale down interval window instead of the
+            // latest parallelism when scaling down
+            return ParallelismChange.build(delayedScaleDownInfo.getMaxRecommendedParallelism());
         }
     }
```
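The new branch depends on `DelayedScaleDown#triggerScaleDown` returning, per vertex, the first time a scale down was requested plus the maximum parallelism recommended since then. That class is not part of this diff, so the following is only a hedged sketch of what such bookkeeping could look like; the field layout and method shapes are assumptions inferred from the call sites above, not the actual implementation.

```java
import java.time.Instant;
import java.util.HashMap;
import java.util.Map;

// Hedged sketch of the per-vertex bookkeeping that applyScaleDownInterval relies on.
// The real DelayedScaleDown class is not shown in this diff; everything here is an
// assumption based on how the diff calls it.
final class DelayedScaleDownSketch {

    static final class DelayedScaleDownInfo {
        private final Instant firstTriggerTime;
        private final int maxRecommendedParallelism;

        DelayedScaleDownInfo(Instant firstTriggerTime, int maxRecommendedParallelism) {
            this.firstTriggerTime = firstTriggerTime;
            this.maxRecommendedParallelism = maxRecommendedParallelism;
        }

        Instant getFirstTriggerTime() {
            return firstTriggerTime;
        }

        int getMaxRecommendedParallelism() {
            return maxRecommendedParallelism;
        }
    }

    private final Map<String, DelayedScaleDownInfo> perVertex = new HashMap<>();

    // Record one scale-down recommendation: keep the earliest trigger time and the
    // maximum parallelism recommended within the current window.
    DelayedScaleDownInfo triggerScaleDown(String vertex, Instant now, int newParallelism) {
        return perVertex.merge(
                vertex,
                new DelayedScaleDownInfo(now, newParallelism),
                (previous, fresh) ->
                        new DelayedScaleDownInfo(
                                previous.firstTriggerTime,
                                Math.max(
                                        previous.maxRecommendedParallelism,
                                        fresh.maxRecommendedParallelism)));
    }

    // Once a rescale actually happens, the window for the vertex is reset.
    void clearVertex(String vertex) {
        perVertex.remove(vertex);
    }
}
```

Picking the window maximum rather than the latest recommendation is what reduces the rescale frequency named in the PR title: with a one-hour interval, if the recommended parallelism dips to 6, then 8, then 5 within the hour, the vertex is scaled down once to 8 (the window maximum) when the interval elapses, rather than rescaling after every momentary low.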