Skip to content

Commit 150bee1

Browse files
author
nishita-pattanayak
committed
[FLINK-32033][Kubernetes-Operator] Fix Lifecycle status in case of MISSING/ERROR JM status with unrecoverable error
1 parent 188a27d commit 150bee1

File tree

2 files changed

+19
-55
lines changed

2 files changed

+19
-55
lines changed

flink-kubernetes-operator-api/src/main/java/org/apache/flink/kubernetes/operator/api/status/CommonStatus.java

Lines changed: 8 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -93,25 +93,16 @@ public ResourceLifecycleState getLifecycleState() {
9393
// Check for unrecoverable deployments that should be marked as FAILED
9494
if (this instanceof FlinkDeploymentStatus) {
9595
FlinkDeploymentStatus deploymentStatus = (FlinkDeploymentStatus) this;
96-
var jmStatus = deploymentStatus.getJobManagerDeploymentStatus();
97-
98-
// ERROR deployments are in terminal error state and should always be FAILED
99-
if (jmStatus == JobManagerDeploymentStatus.ERROR) {
96+
var jmDeployStatus = deploymentStatus.getJobManagerDeploymentStatus();
97+
98+
// ERROR/MISSING deployments are terminal only when the error reports deleted configmaps
99+
// (configmaps deleted -> manual restore required); only those are reported as FAILED
100+
if ((jmDeployStatus == JobManagerDeploymentStatus.MISSING
101+
|| jmDeployStatus == JobManagerDeploymentStatus.ERROR)
102+
&& StringUtils.isNotEmpty(error)
103+
&& error.contains("configmaps have been deleted")) {
100104
return ResourceLifecycleState.FAILED;
101105
}
102-
103-
// MISSING deployments should be FAILED if they're clearly unrecoverable
104-
if (jmStatus == JobManagerDeploymentStatus.MISSING) {
105-
// Mark as FAILED if error message clearly indicates deployment failure (any time)
106-
if (StringUtils.isNotEmpty(error)) {
107-
return ResourceLifecycleState.FAILED;
108-
}
109-
// Also mark as FAILED if stable deployment is missing without any error
110-
// (indicating it was deleted externally)
111-
else if (reconciliationStatus.isLastReconciledSpecStable()) {
112-
return ResourceLifecycleState.FAILED;
113-
}
114-
}
115106
}
116107

117108
if (reconciliationStatus.getState() == ReconciliationState.ROLLED_BACK) {

flink-kubernetes-operator/src/test/java/org/apache/flink/kubernetes/operator/metrics/lifecycle/ResourceLifecycleMetricsTest.java

Lines changed: 11 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -72,28 +72,8 @@ public void lifecycleStateTest() {
7272
ReconciliationUtils.updateStatusForDeployedSpec(application, new Configuration());
7373
assertEquals(DEPLOYED, application.getStatus().getLifecycleState());
7474

75-
application.getStatus().setJobManagerDeploymentStatus(JobManagerDeploymentStatus.DEPLOYING);
7675
application.getStatus().getReconciliationStatus().markReconciledSpecAsStable();
77-
assertEquals(
78-
STABLE,
79-
application.getStatus().getLifecycleState(),
80-
"JobManager Deployment is in DEPLOYING state, hence application is STABLE");
81-
82-
application
83-
.getStatus()
84-
.setJobManagerDeploymentStatus(JobManagerDeploymentStatus.DEPLOYED_NOT_READY);
85-
application.getStatus().getReconciliationStatus().markReconciledSpecAsStable();
86-
assertEquals(
87-
STABLE,
88-
application.getStatus().getLifecycleState(),
89-
"JobManager Deployment is in DEPLOYED_NOT_READY state, hence application is STABLE");
90-
91-
application.getStatus().setJobManagerDeploymentStatus(JobManagerDeploymentStatus.READY);
92-
application.getStatus().getReconciliationStatus().markReconciledSpecAsStable();
93-
assertEquals(
94-
STABLE,
95-
application.getStatus().getLifecycleState(),
96-
"JobManager Deployment is in READY state, hence application is STABLE");
76+
assertEquals(STABLE, application.getStatus().getLifecycleState());
9777

9878
application.getStatus().setError("errr");
9979
assertEquals(STABLE, application.getStatus().getLifecycleState());
@@ -369,35 +349,28 @@ public void testUnrecoverableDeploymentLifecycleState() {
369349
application.getStatus().getReconciliationStatus().markReconciledSpecAsStable();
370350

371351
application.getStatus().setJobManagerDeploymentStatus(JobManagerDeploymentStatus.ERROR);
372-
application.getStatus().setError(null);
373-
assertEquals(
374-
FAILED,
375-
application.getStatus().getLifecycleState(),
376-
"ERROR deployment should always be FAILED (terminal error state)");
377-
378-
application.getStatus().setJobManagerDeploymentStatus(JobManagerDeploymentStatus.ERROR);
379-
application.getStatus().setError("JobManager deployment failed to start");
352+
application
353+
.getStatus()
354+
.setError(
355+
"JobManager deployment is missing and HA data is not available to make stateful upgrades. "
356+
+ "It is possible that the job has finished or terminally failed, or the configmaps have been deleted. "
357+
+ "Manual restore required.");
380358
assertEquals(
381359
FAILED,
382360
application.getStatus().getLifecycleState(),
383-
"ERROR deployment with error message should also be FAILED");
361+
"ERROR deployment with `configmaps have been deleted` error should always be FAILED (terminal error state)");
384362

385363
application.getStatus().setJobManagerDeploymentStatus(JobManagerDeploymentStatus.MISSING);
386364
application
387365
.getStatus()
388-
.setError("JobManager deployment was deleted and cannot be recovered");
366+
.setError(
367+
"HA metadata not available to restore from last state. "
368+
+ "It is possible that the job has finished or terminally failed, or the configmaps have been deleted. ");
389369
assertEquals(
390370
FAILED,
391371
application.getStatus().getLifecycleState(),
392372
"MISSING deployment with error should be FAILED");
393373

394-
application.getStatus().setError(null);
395-
application.getStatus().setJobManagerDeploymentStatus(JobManagerDeploymentStatus.MISSING);
396-
assertEquals(
397-
FAILED,
398-
application.getStatus().getLifecycleState(),
399-
"MISSING deployment with stable reconciliation should be FAILED");
400-
401374
application.getStatus().setError(null);
402375
application.getStatus().setJobManagerDeploymentStatus(JobManagerDeploymentStatus.MISSING);
403376
// Reset to DEPLOYED state (not stable yet) to simulate ongoing deployment

0 commit comments

Comments
 (0)