|
18 | 18 | package org.apache.flink.kubernetes.operator.controller; |
19 | 19 |
|
20 | 20 | import org.apache.flink.configuration.Configuration; |
| 21 | +import org.apache.flink.kubernetes.operator.OperatorTestBase; |
21 | 22 | import org.apache.flink.kubernetes.operator.TestUtils; |
22 | | -import org.apache.flink.kubernetes.operator.TestingFlinkService; |
23 | 23 | import org.apache.flink.kubernetes.operator.api.FlinkDeployment; |
24 | 24 | import org.apache.flink.kubernetes.operator.api.spec.FlinkVersion; |
25 | 25 | import org.apache.flink.kubernetes.operator.api.spec.UpgradeMode; |
| 26 | +import org.apache.flink.kubernetes.operator.api.status.CheckpointInfo; |
| 27 | +import org.apache.flink.kubernetes.operator.api.status.FlinkDeploymentStatus; |
26 | 28 | import org.apache.flink.kubernetes.operator.api.status.JobManagerDeploymentStatus; |
| 29 | +import org.apache.flink.kubernetes.operator.api.status.SnapshotTriggerType; |
27 | 30 | import org.apache.flink.kubernetes.operator.config.FlinkConfigManager; |
| 31 | +import org.apache.flink.kubernetes.operator.observer.SnapshotObserver; |
| 32 | +import org.apache.flink.kubernetes.operator.utils.SnapshotStatus; |
| 33 | +import org.apache.flink.kubernetes.operator.utils.SnapshotUtils; |
| 34 | +import org.apache.flink.runtime.jobgraph.SavepointConfigOptions; |
28 | 35 |
|
29 | 36 | import io.fabric8.kubernetes.client.KubernetesClient; |
30 | 37 | import io.fabric8.kubernetes.client.server.mock.EnableKubernetesMockClient; |
31 | 38 | import io.javaoperatorsdk.operator.api.reconciler.Context; |
| 39 | +import lombok.Getter; |
32 | 40 | import org.junit.jupiter.api.BeforeEach; |
33 | 41 | import org.junit.jupiter.params.ParameterizedTest; |
| 42 | +import org.junit.jupiter.params.provider.EnumSource; |
34 | 43 | import org.junit.jupiter.params.provider.MethodSource; |
35 | 44 |
|
36 | 45 | import static org.apache.flink.kubernetes.operator.config.KubernetesOperatorConfigOptions.OPERATOR_JOB_RESTART_FAILED; |
| 46 | +import static org.apache.flink.kubernetes.operator.reconciler.SnapshotType.CHECKPOINT; |
37 | 47 | import static org.junit.jupiter.api.Assertions.assertEquals; |
| 48 | +import static org.junit.jupiter.api.Assertions.assertFalse; |
| 49 | +import static org.junit.jupiter.api.Assertions.assertNotNull; |
| 50 | +import static org.junit.jupiter.api.Assertions.assertNull; |
38 | 51 |
|
39 | 52 | /** |
40 | 53 | * Unhealthy deployment restart tests. |
41 | 54 | */ |
42 | 55 | @EnableKubernetesMockClient(crud = true) |
43 | | -public class FailedDeploymentRestartTest { |
| 56 | +public class FailedDeploymentRestartTest extends OperatorTestBase { |
44 | 57 | private FlinkConfigManager configManager; |
45 | 58 |
|
46 | | - private TestingFlinkService flinkService; |
47 | 59 | private Context<FlinkDeployment> context; |
48 | 60 | private TestingFlinkDeploymentController testController; |
| 61 | + private SnapshotObserver<FlinkDeployment, FlinkDeploymentStatus> observer; |
49 | 62 |
|
50 | | - private KubernetesClient kubernetesClient; |
| 63 | + @Getter private KubernetesClient kubernetesClient; |
51 | 64 |
|
52 | 65 | @BeforeEach /* Rebuilds the config manager, context, controller and snapshot observer before every test. */ |
53 | 66 | public void setup() { |
54 | 67 | var configuration = new Configuration(); |
55 | 68 | configuration.set(OPERATOR_JOB_RESTART_FAILED, true); /* every test in this class runs with this option set to true */ |
56 | 69 | configManager = new FlinkConfigManager(configuration); |
57 | | - flinkService = new TestingFlinkService(kubernetesClient); |
58 | 70 | context = flinkService.getContext(); /* flinkService is no longer created here; presumably provided by OperatorTestBase - TODO confirm */ |
59 | 71 | testController = new TestingFlinkDeploymentController(configManager, flinkService); |
60 | 72 | kubernetesClient.resource(TestUtils.buildApplicationCluster()).createOrReplace(); /* register the application cluster resource with the mock client */ |
| 73 | + observer = new SnapshotObserver<>(eventRecorder); /* eventRecorder is not declared in this class; presumably inherited from OperatorTestBase - TODO confirm */ |
61 | 74 | } |
62 | 75 |
|
63 | 76 | @ParameterizedTest |
@@ -98,4 +111,68 @@ public void verifyFailedApplicationRecovery(FlinkVersion flinkVersion, UpgradeMo |
98 | 111 | appCluster.getSpec(), |
99 | 112 | appCluster.getStatus().getReconciliationStatus().deserializeLastReconciledSpec()); |
100 | 113 | } |
| 114 | + |
| 115 | + @ParameterizedTest |
| 116 | + @EnumSource(UpgradeMode.class) |
| 117 | + public void verifyFailedApplicationRecoveryWithCheckpoint(UpgradeMode upgradeMode) |
| 118 | + throws Exception { |
| 119 | + FlinkDeployment appCluster = TestUtils.buildApplicationCluster(); |
| 120 | + appCluster.getSpec().getJob().setUpgradeMode(upgradeMode); |
| 121 | + |
| 122 | + // Start a healthy deployment |
| 123 | + testController.reconcile(appCluster, context); |
| 124 | + testController.reconcile(appCluster, context); |
| 125 | + testController.reconcile(appCluster, context); |
| 126 | + |
| 127 | + // Mark job_id |
| 128 | + String jobId = appCluster.getStatus().getJobStatus().getJobId(); |
| 129 | + assertNotNull(jobId); |
| 130 | + assertEquals( |
| 131 | + JobManagerDeploymentStatus.READY, |
| 132 | + appCluster.getStatus().getJobManagerDeploymentStatus()); |
| 133 | + assertEquals("RUNNING", appCluster.getStatus().getJobStatus().getState()); |
| 134 | + assertNull(flinkService.getSubmittedConf().get(SavepointConfigOptions.SAVEPOINT_PATH)); |
| 135 | + |
| 136 | + // trigger checkpoint |
| 137 | + CheckpointInfo checkpointInfo = appCluster.getStatus().getJobStatus().getCheckpointInfo(); |
| 138 | + flinkService.triggerCheckpoint( |
| 139 | + null, |
| 140 | + SnapshotTriggerType.PERIODIC, |
| 141 | + checkpointInfo, |
| 142 | + configManager.getObserveConfig(appCluster)); |
| 143 | + |
| 144 | + // Pending |
| 145 | + observer.observeCheckpointStatus(getResourceContext(appCluster)); |
| 146 | + // Completed |
| 147 | + observer.observeCheckpointStatus(getResourceContext(appCluster)); |
| 148 | + assertFalse(SnapshotUtils.checkpointInProgress(appCluster.getStatus().getJobStatus())); |
| 149 | + assertEquals( |
| 150 | + SnapshotUtils.getLastSnapshotStatus(appCluster, CHECKPOINT), |
| 151 | + SnapshotStatus.SUCCEEDED); |
| 152 | + |
| 153 | + // Make deployment unhealthy |
| 154 | + flinkService.markApplicationJobFailedWithError( |
| 155 | + flinkService.listJobs().get(0).f1.getJobId(), "Failed job"); |
| 156 | + testController.reconcile(appCluster, context); |
| 157 | + assertEquals( |
| 158 | + JobManagerDeploymentStatus.DEPLOYING, |
| 159 | + appCluster.getStatus().getJobManagerDeploymentStatus()); |
| 160 | + |
| 161 | + // After restart the deployment is healthy again |
| 162 | + testController.reconcile(appCluster, context); |
| 163 | + testController.reconcile(appCluster, context); |
| 164 | + assertEquals( |
| 165 | + JobManagerDeploymentStatus.READY, |
| 166 | + appCluster.getStatus().getJobManagerDeploymentStatus()); |
| 167 | + assertEquals("RUNNING", appCluster.getStatus().getJobStatus().getState()); |
| 168 | + |
| 169 | + // check savepoint_path |
| 170 | + if (upgradeMode != UpgradeMode.STATELESS) { |
| 171 | + assertEquals( |
| 172 | + flinkService.getSubmittedConf().get(SavepointConfigOptions.SAVEPOINT_PATH), |
| 173 | + "ck_0"); |
| 174 | + } else { |
| 175 | + assertNull(flinkService.getSubmittedConf().get(SavepointConfigOptions.SAVEPOINT_PATH)); |
| 176 | + } |
| 177 | + } |
101 | 178 | } |
0 commit comments