Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,28 @@ public ResourceLifecycleState getLifecycleState() {
return ResourceLifecycleState.FAILED;
}

// Check for unrecoverable deployments that should be marked as FAILED
if (this instanceof FlinkDeploymentStatus) {
FlinkDeploymentStatus deploymentStatus = (FlinkDeploymentStatus) this;
var jmDeployStatus = deploymentStatus.getJobManagerDeploymentStatus();

// ERROR/MISSING deployments are in terminal error state
// [Configmaps deleted -> require manual restore] and should always be FAILED
if ((jmDeployStatus == JobManagerDeploymentStatus.MISSING
|| jmDeployStatus == JobManagerDeploymentStatus.ERROR)
&& StringUtils.isNotEmpty(error)
&& (error.toLowerCase()
.contains(
"it is possible that the job has finished or terminally failed, or the configmaps have been deleted")
|| error.toLowerCase().contains("manual restore required")
|| error.toLowerCase().contains("ha metadata not available")
|| error.toLowerCase()
.contains(
"ha data is not available to make stateful upgrades"))) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are we checking this specific error?
In any case we are the ones triggering this error so please create a constant in the AbstractJobReconciler and use that here

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gyfora this seems to be the only case where we know that the cluster cannot recover on its own and needs a manual restore, hence I used this. I will set this as a constant instead for cleaner code.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gyfora

  1. There are multiple instances where the "HA metadata not available" condition is written in different forms, such as "HA metadata not available" and "HA data is not available". Should we maintain uniformity by changing these exception messages to use a constant (now that it is available)?

  2. Also currently flink-operator-api does not have flink-operator as a dependency -> to use the constants in AbstractJobReconciler we would have to import it as a dependency as the status change logic resides in flink-operator-api.
    Should I still go ahead with this?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If possible let's use a single constant, and we can keep that constant in the operator api module so the reconciler can use it

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gyfora
I have added 3 constants for error messages which are frequently used and indicate a terminal state, and referenced those in the reconcilers to maintain uniformity. I have also tried to keep the net changes to a minimum (although a few error messages will differ slightly). Do let me know if this looks good.

return ResourceLifecycleState.FAILED;
}
}

if (reconciliationStatus.getState() == ReconciliationState.ROLLED_BACK) {
return ResourceLifecycleState.ROLLED_BACK;
} else if (reconciliationStatus.isLastReconciledSpecStable()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import org.apache.flink.kubernetes.operator.api.FlinkSessionJob;
import org.apache.flink.kubernetes.operator.api.lifecycle.ResourceLifecycleState;
import org.apache.flink.kubernetes.operator.api.spec.JobState;
import org.apache.flink.kubernetes.operator.api.status.JobManagerDeploymentStatus;
import org.apache.flink.kubernetes.operator.api.status.ReconciliationState;
import org.apache.flink.kubernetes.operator.config.FlinkOperatorConfiguration;
import org.apache.flink.kubernetes.operator.metrics.CustomResourceMetrics;
Expand Down Expand Up @@ -337,4 +338,50 @@ private Map<ResourceLifecycleState, List<Histogram>> initTimeHistos() {
}
return histos;
}

@Test
public void testUnrecoverableDeploymentLifecycleState() {
    var resource = TestUtils.buildApplicationCluster();
    var status = resource.getStatus();

    // Simulate a resource that has already completed a successful deployment so the
    // lifecycle computation is past the first-deployment phase.
    ReconciliationUtils.updateStatusForDeployedSpec(resource, new Configuration());
    status.getReconciliationStatus().markReconciledSpecAsStable();

    // Scenario 1: an ERROR deployment carrying a terminal error message
    // (deleted configmaps / manual restore) must be reported as FAILED.
    status.setJobManagerDeploymentStatus(JobManagerDeploymentStatus.ERROR);
    status.setError(
            "JobManager deployment is missing and HA data is not available to make stateful upgrades. "
                    + "It is possible that the job has finished or terminally failed, or the configmaps have been deleted. "
                    + "Manual restore required.");
    assertEquals(
            FAILED,
            status.getLifecycleState(),
            "ERROR deployment with `configmaps have been deleted` error should always be FAILED (terminal error state)");

    // Scenario 2: the same terminal classification applies when the JobManager
    // deployment is MISSING and HA metadata cannot be restored.
    status.setJobManagerDeploymentStatus(JobManagerDeploymentStatus.MISSING);
    status.setError(
            "HA metadata not available to restore from last state. "
                    + "It is possible that the job has finished or terminally failed, or the configmaps have been deleted. ");
    assertEquals(
            FAILED,
            status.getLifecycleState(),
            "MISSING deployment with error should be FAILED");

    // Scenario 3: with no error message and an in-progress (not yet stable)
    // reconciliation, a MISSING deployment is still considered to be deploying.
    status.setError(null);
    status.setJobManagerDeploymentStatus(JobManagerDeploymentStatus.MISSING);
    status.getReconciliationStatus().setState(ReconciliationState.DEPLOYED);
    status.getReconciliationStatus().setLastStableSpec(null);
    assertEquals(
            DEPLOYED,
            status.getLifecycleState(),
            "MISSING deployment before stability should not be FAILED yet (still deploying)");
}
}