Skip to content

Commit cb90b10

Browse files
committed
[FLINK-35126] Rework default checkpoint progress check window
1 parent 2b82a93 commit cb90b10

File tree

6 files changed

+66
-38
lines changed

6 files changed

+66
-38
lines changed

docs/layouts/shortcodes/generated/dynamic_section.html

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,15 @@
2222
</tr>
2323
<tr>
2424
<td><h5>kubernetes.operator.cluster.health-check.checkpoint-progress.enabled</h5></td>
25-
<td style="word-wrap: break-word;">false</td>
25+
<td style="word-wrap: break-word;">true</td>
2626
<td>Boolean</td>
2727
<td>Whether to enable checkpoint progress health check for clusters.</td>
2828
</tr>
2929
<tr>
3030
<td><h5>kubernetes.operator.cluster.health-check.checkpoint-progress.window</h5></td>
31-
<td style="word-wrap: break-word;">5 min</td>
31+
<td style="word-wrap: break-word;">(none)</td>
3232
<td>Duration</td>
33-
<td>If no checkpoints are completed within the defined time window, the job is considered unhealthy. This must be bigger than checkpointing interval.</td>
33+
<td>If no checkpoints are completed within the defined time window, the job is considered unhealthy. The minimum window size is `max(checkpointingInterval, checkpointTimeout) * (tolerableCheckpointFailures + 2)`, which also serves as the default value when checkpointing is enabled. For example with checkpoint interval 10 minutes and 0 tolerable failures, the default progress check window will be 20 minutes.</td>
3434
</tr>
3535
<tr>
3636
<td><h5>kubernetes.operator.cluster.health-check.enabled</h5></td>

docs/layouts/shortcodes/generated/kubernetes_operator_config_configuration.html

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,15 @@
2222
</tr>
2323
<tr>
2424
<td><h5>kubernetes.operator.cluster.health-check.checkpoint-progress.enabled</h5></td>
25-
<td style="word-wrap: break-word;">false</td>
25+
<td style="word-wrap: break-word;">true</td>
2626
<td>Boolean</td>
2727
<td>Whether to enable checkpoint progress health check for clusters.</td>
2828
</tr>
2929
<tr>
3030
<td><h5>kubernetes.operator.cluster.health-check.checkpoint-progress.window</h5></td>
31-
<td style="word-wrap: break-word;">5 min</td>
31+
<td style="word-wrap: break-word;">(none)</td>
3232
<td>Duration</td>
33-
<td>If no checkpoints are completed within the defined time window, the job is considered unhealthy. This must be bigger than checkpointing interval.</td>
33+
<td>If no checkpoints are completed within the defined time window, the job is considered unhealthy. The minimum window size is `max(checkpointingInterval, checkpointTimeout) * (tolerableCheckpointFailures + 2)`, which also serves as the default value when checkpointing is enabled. For example with checkpoint interval 10 minutes and 0 tolerable failures, the default progress check window will be 20 minutes.</td>
3434
</tr>
3535
<tr>
3636
<td><h5>kubernetes.operator.cluster.health-check.enabled</h5></td>

flink-kubernetes-operator/src/main/java/org/apache/flink/kubernetes/operator/config/KubernetesOperatorConfigOptions.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -502,7 +502,7 @@ public static String operatorConfigKey(String key) {
502502
OPERATOR_CLUSTER_HEALTH_CHECK_CHECKPOINT_PROGRESS_ENABLED =
503503
operatorConfig("cluster.health-check.checkpoint-progress.enabled")
504504
.booleanType()
505-
.defaultValue(false)
505+
.defaultValue(true)
506506
.withDescription(
507507
"Whether to enable checkpoint progress health check for clusters.");
508508

@@ -511,9 +511,9 @@ public static String operatorConfigKey(String key) {
511511
OPERATOR_CLUSTER_HEALTH_CHECK_CHECKPOINT_PROGRESS_WINDOW =
512512
operatorConfig("cluster.health-check.checkpoint-progress.window")
513513
.durationType()
514-
.defaultValue(Duration.ofMinutes(5))
514+
.noDefaultValue()
515515
.withDescription(
516-
"If no checkpoints are completed within the defined time window, the job is considered unhealthy. This must be bigger than checkpointing interval.");
516+
"If no checkpoints are completed within the defined time window, the job is considered unhealthy. The minimum window size is `max(checkpointingInterval, checkpointTimeout) * (tolerableCheckpointFailures + 2)`, which also serves as the default value when checkpointing is enabled. For example with checkpoint interval 10 minutes and 0 tolerable failures, the default progress check window will be 20 minutes.");
517517

518518
@Documentation.Section(SECTION_DYNAMIC)
519519
public static final ConfigOption<Boolean> OPERATOR_JOB_RESTART_FAILED =

flink-kubernetes-operator/src/main/java/org/apache/flink/kubernetes/operator/observer/ClusterHealthEvaluator.java

Lines changed: 33 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -178,30 +178,46 @@ private boolean evaluateCheckpoints(
178178
return true;
179179
}
180180

181-
var completedCheckpointsCheckWindow =
182-
configuration.get(OPERATOR_CLUSTER_HEALTH_CHECK_CHECKPOINT_PROGRESS_WINDOW);
181+
var windowOpt =
182+
configuration.getOptional(OPERATOR_CLUSTER_HEALTH_CHECK_CHECKPOINT_PROGRESS_WINDOW);
183183

184184
CheckpointConfig checkpointConfig = new CheckpointConfig();
185185
checkpointConfig.configure(configuration);
186186
var checkpointingInterval = checkpointConfig.getCheckpointInterval();
187187
var checkpointingTimeout = checkpointConfig.getCheckpointTimeout();
188-
var tolerationFailureNumber = checkpointConfig.getTolerableCheckpointFailureNumber() + 1;
189-
var minCompletedCheckpointsCheckWindow =
190-
Math.max(
191-
checkpointingInterval * tolerationFailureNumber,
192-
checkpointingTimeout * tolerationFailureNumber);
193-
if (completedCheckpointsCheckWindow.toMillis() < minCompletedCheckpointsCheckWindow) {
194-
LOG.warn(
195-
"{} is not long enough. Default to max({} * {}, {} * {}): {}ms",
196-
OPERATOR_CLUSTER_HEALTH_CHECK_CHECKPOINT_PROGRESS_WINDOW.key(),
197-
CHECKPOINTING_INTERVAL.key(),
198-
TOLERABLE_FAILURE_NUMBER.key(),
199-
CHECKPOINTING_TIMEOUT.key(),
200-
TOLERABLE_FAILURE_NUMBER.key(),
201-
minCompletedCheckpointsCheckWindow);
202-
completedCheckpointsCheckWindow = Duration.ofMillis(minCompletedCheckpointsCheckWindow);
188+
var tolerationFailureNumber = checkpointConfig.getTolerableCheckpointFailureNumber() + 2;
189+
var minCheckWindow =
190+
Duration.ofMillis(
191+
Math.max(
192+
checkpointingInterval * tolerationFailureNumber,
193+
checkpointingTimeout * tolerationFailureNumber));
194+
195+
if (windowOpt.isEmpty() && !checkpointConfig.isCheckpointingEnabled()) {
196+
// If no explicit checkpoint check window is specified and checkpointing is disabled
197+
// based on the config, we don't do anything
198+
return true;
203199
}
204200

201+
var completedCheckpointsCheckWindow =
202+
windowOpt
203+
.filter(
204+
d -> {
205+
if (d.compareTo(minCheckWindow) < 0) {
206+
LOG.debug(
207+
"{} is not long enough. Default to max({} * {}, {} * {}): {}",
208+
OPERATOR_CLUSTER_HEALTH_CHECK_CHECKPOINT_PROGRESS_WINDOW
209+
.key(),
210+
CHECKPOINTING_INTERVAL.key(),
211+
TOLERABLE_FAILURE_NUMBER.key(),
212+
CHECKPOINTING_TIMEOUT.key(),
213+
TOLERABLE_FAILURE_NUMBER.key(),
214+
minCheckWindow);
215+
return false;
216+
}
217+
return true;
218+
})
219+
.orElse(minCheckWindow);
220+
205221
if (observedClusterHealthInfo.getNumCompletedCheckpoints()
206222
< lastValidClusterHealthInfo.getNumCompletedCheckpoints()) {
207223
LOG.debug(

flink-kubernetes-operator/src/test/java/org/apache/flink/kubernetes/operator/controller/UnhealthyDeploymentRestartTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ public void verifyApplicationNoCompletedCheckpointsJmRecovery(
138138
// Ensure the last savepoint has been taken more than 10 minutes ago (Default checkpoint
139139
// interval)
140140
clusterHealthInfo.setNumCompletedCheckpointsIncreasedTimeStamp(
141-
clusterHealthInfo.getNumCompletedCheckpointsIncreasedTimeStamp() - 600000);
141+
clusterHealthInfo.getNumCompletedCheckpointsIncreasedTimeStamp() - 1200000);
142142
setLastValidClusterHealthInfo(appCluster.getStatus().getClusterInfo(), clusterHealthInfo);
143143
testController.getStatusRecorder().patchAndCacheStatus(appCluster, kubernetesClient);
144144
testController.reconcile(appCluster, context);

flink-kubernetes-operator/src/test/java/org/apache/flink/kubernetes/operator/observer/ClusterHealthEvaluatorTest.java

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,8 @@ public void evaluateShouldOverwriteRestartCountWhenTimestampIsOutOfWindow() {
159159
@Test
160160
public void evaluateShouldOverwriteCompletedCheckpointCountWhenLess() {
161161
configuration.set(OPERATOR_CLUSTER_HEALTH_CHECK_CHECKPOINT_PROGRESS_ENABLED, true);
162+
configuration.set(
163+
OPERATOR_CLUSTER_HEALTH_CHECK_CHECKPOINT_PROGRESS_WINDOW, Duration.ofMinutes(5));
162164
var observedClusterHealthInfo1 = createClusterHealthInfo(validInstant1, 0, 1);
163165
var observedClusterHealthInfo2 = createClusterHealthInfo(validInstant2, 0, 0);
164166

@@ -278,39 +280,49 @@ private static Stream<Arguments> provideParametersEvaluateCheckpointing() {
278280
Instant tenSecInstant = ofEpochSecond(10);
279281
Instant twoMinInstant = ofEpochSecond(120);
280282
Instant fourMinInstant = twoMinInstant.plus(2, ChronoUnit.MINUTES);
283+
284+
Duration oneMin = Duration.ofMinutes(1);
281285
return Stream.of(
282286
// ShouldMarkClusterUnhealthyWhenNoCompletedCheckpointsOutsideWindow
283-
Arguments.of(twoMinInstant, fourMinInstant, 30L, 30L, null, false),
287+
Arguments.of(twoMinInstant, fourMinInstant, oneMin, 30L, 30L, null, false),
288+
// Verify checkpoint progress even if checkpointing not configured
289+
Arguments.of(twoMinInstant, fourMinInstant, oneMin, null, 30L, null, false),
290+
// Verify default window if not explicitly configured
291+
Arguments.of(twoMinInstant, fourMinInstant, null, 30L, 30L, null, false),
292+
// Verify check is off if both window and checkpointing is not configured
293+
Arguments.of(twoMinInstant, fourMinInstant, null, null, 30L, null, true),
284294
// ShouldMarkClusterHealthyWhenCompletedCheckpointsWithOutsideWindowFromCheckpointInterval
285-
Arguments.of(twoMinInstant, fourMinInstant, 120L, 30L, null, true),
295+
Arguments.of(twoMinInstant, fourMinInstant, oneMin, 60L, 30L, null, true),
286296
// ShouldMarkClusterUnhealthyWhenNoCompletedCheckpointsWithOutsideWindowFromCheckpointInterval
287-
Arguments.of(tenSecInstant, fourMinInstant, 120L, 30L, null, false),
297+
Arguments.of(tenSecInstant, fourMinInstant, oneMin, 60L, 30L, null, false),
288298
// ShouldMarkClusterHealthyWhenCompletedCheckpointsWithOutsideWindowFromCheckpointIntervalTimesNbTolerableFailure
289-
Arguments.of(twoMinInstant, fourMinInstant, 30L, 10L, 3, true),
299+
Arguments.of(twoMinInstant, fourMinInstant, oneMin, 30L, 10L, 3, true),
290300
// ShouldMarkClusterHealthyWhenNoCompletedCheckpointsWithOutsideWindowFromCheckpointIntervalTimesNbTolerableFailure
291-
Arguments.of(tenSecInstant, fourMinInstant, 30L, 10L, 3, false),
301+
Arguments.of(tenSecInstant, fourMinInstant, oneMin, 30L, 10L, 3, false),
292302
// ShouldMarkClusterHealthyWhenCompletedCheckpointsWithOutsideWindowFromCheckpointingTimeout
293-
Arguments.of(twoMinInstant, fourMinInstant, 30L, 120L, null, true),
303+
Arguments.of(twoMinInstant, fourMinInstant, oneMin, 30L, 60L, null, true),
294304
// ShouldMarkClusterHealthyWhenNoCompletedCheckpointsWithOutsideWindowFromCheckpointingTimeout
295-
Arguments.of(tenSecInstant, fourMinInstant, 30L, 120L, null, false),
305+
Arguments.of(tenSecInstant, fourMinInstant, oneMin, 30L, 60L, null, false),
296306
// ShouldMarkClusterHealthyWhenCompletedCheckpointsWithOutsideWindowFromCheckpointingTimeoutTimesNbTolerableFailure
297-
Arguments.of(twoMinInstant, fourMinInstant, 10L, 30L, 3, true),
307+
Arguments.of(twoMinInstant, fourMinInstant, oneMin, 10L, 30L, 3, true),
298308
// ShouldMarkClusterHealthyWhenNoCompletedCheckpointsWithOutsideWindowFromCheckpointingTimeoutTimesNbTolerableFailure
299-
Arguments.of(tenSecInstant, fourMinInstant, 10L, 30L, 3, false));
309+
Arguments.of(tenSecInstant, fourMinInstant, oneMin, 10L, 30L, 3, false));
300310
}
301311

302312
@ParameterizedTest
303313
@MethodSource("provideParametersEvaluateCheckpointing")
304314
public void evaluateCheckpointing(
305315
Instant validInstant1,
306316
Instant validInstant2,
317+
Duration window,
307318
Long checkpointingInterval,
308319
long checkpointingTimeout,
309320
Integer tolerationFailureNumber,
310321
boolean expectedIsHealthy) {
311322
configuration.set(OPERATOR_CLUSTER_HEALTH_CHECK_CHECKPOINT_PROGRESS_ENABLED, true);
312-
configuration.set(
313-
OPERATOR_CLUSTER_HEALTH_CHECK_CHECKPOINT_PROGRESS_WINDOW, Duration.ofMinutes(1));
323+
if (window != null) {
324+
configuration.set(OPERATOR_CLUSTER_HEALTH_CHECK_CHECKPOINT_PROGRESS_WINDOW, window);
325+
}
314326
if (checkpointingInterval != null) {
315327
configuration.set(CHECKPOINTING_INTERVAL, Duration.ofSeconds(checkpointingInterval));
316328
}

0 commit comments

Comments
 (0)