|
18 | 18 | package org.apache.flink.kubernetes.operator.controller;
|
19 | 19 |
|
20 | 20 | import org.apache.flink.configuration.Configuration;
|
| 21 | +import org.apache.flink.kubernetes.operator.OperatorTestBase; |
21 | 22 | import org.apache.flink.kubernetes.operator.TestUtils;
|
22 |
| -import org.apache.flink.kubernetes.operator.TestingFlinkService; |
23 | 23 | import org.apache.flink.kubernetes.operator.api.FlinkDeployment;
|
24 | 24 | import org.apache.flink.kubernetes.operator.api.spec.FlinkVersion;
|
25 | 25 | import org.apache.flink.kubernetes.operator.api.spec.UpgradeMode;
|
| 26 | +import org.apache.flink.kubernetes.operator.api.status.CheckpointInfo; |
| 27 | +import org.apache.flink.kubernetes.operator.api.status.FlinkDeploymentStatus; |
26 | 28 | import org.apache.flink.kubernetes.operator.api.status.JobManagerDeploymentStatus;
|
| 29 | +import org.apache.flink.kubernetes.operator.api.status.SnapshotTriggerType; |
27 | 30 | import org.apache.flink.kubernetes.operator.config.FlinkConfigManager;
|
| 31 | +import org.apache.flink.kubernetes.operator.observer.SnapshotObserver; |
| 32 | +import org.apache.flink.kubernetes.operator.utils.SnapshotStatus; |
| 33 | +import org.apache.flink.kubernetes.operator.utils.SnapshotUtils; |
| 34 | +import org.apache.flink.runtime.jobgraph.SavepointConfigOptions; |
28 | 35 |
|
29 | 36 | import io.fabric8.kubernetes.client.KubernetesClient;
|
30 | 37 | import io.fabric8.kubernetes.client.server.mock.EnableKubernetesMockClient;
|
31 | 38 | import io.javaoperatorsdk.operator.api.reconciler.Context;
|
| 39 | +import lombok.Getter; |
32 | 40 | import org.junit.jupiter.api.BeforeEach;
|
33 | 41 | import org.junit.jupiter.params.ParameterizedTest;
|
| 42 | +import org.junit.jupiter.params.provider.EnumSource; |
34 | 43 | import org.junit.jupiter.params.provider.MethodSource;
|
35 | 44 |
|
36 | 45 | import static org.apache.flink.kubernetes.operator.config.KubernetesOperatorConfigOptions.OPERATOR_JOB_RESTART_FAILED;
|
| 46 | +import static org.apache.flink.kubernetes.operator.reconciler.SnapshotType.CHECKPOINT; |
37 | 47 | import static org.junit.jupiter.api.Assertions.assertEquals;
|
| 48 | +import static org.junit.jupiter.api.Assertions.assertFalse; |
| 49 | +import static org.junit.jupiter.api.Assertions.assertNotNull; |
| 50 | +import static org.junit.jupiter.api.Assertions.assertNull; |
38 | 51 |
|
39 | 52 | /**
|
40 | 53 | * Unhealthy deployment restart tests.
|
41 | 54 | */
|
42 | 55 | @EnableKubernetesMockClient(crud = true)
|
43 |
| -public class FailedDeploymentRestartTest { |
| 56 | +public class FailedDeploymentRestartTest extends OperatorTestBase { |
44 | 57 | private FlinkConfigManager configManager;
|
45 | 58 |
|
46 |
| - private TestingFlinkService flinkService; |
47 | 59 | private Context<FlinkDeployment> context;
|
48 | 60 | private TestingFlinkDeploymentController testController;
|
| 61 | + private SnapshotObserver<FlinkDeployment, FlinkDeploymentStatus> observer; |
49 | 62 |
|
50 |
| - private KubernetesClient kubernetesClient; |
| 63 | + @Getter private KubernetesClient kubernetesClient; |
51 | 64 |
|
52 | 65 | @BeforeEach
|
53 | 66 | public void setup() {
|
54 | 67 | var configuration = new Configuration();
|
55 | 68 | configuration.set(OPERATOR_JOB_RESTART_FAILED, true);
|
56 | 69 | configManager = new FlinkConfigManager(configuration);
|
57 |
| - flinkService = new TestingFlinkService(kubernetesClient); |
58 | 70 | context = flinkService.getContext();
|
59 | 71 | testController = new TestingFlinkDeploymentController(configManager, flinkService);
|
60 | 72 | kubernetesClient.resource(TestUtils.buildApplicationCluster()).createOrReplace();
|
| 73 | + observer = new SnapshotObserver<>(eventRecorder); |
61 | 74 | }
|
62 | 75 |
|
63 | 76 | @ParameterizedTest
|
@@ -98,4 +111,68 @@ public void verifyFailedApplicationRecovery(FlinkVersion flinkVersion, UpgradeMo
|
98 | 111 | appCluster.getSpec(),
|
99 | 112 | appCluster.getStatus().getReconciliationStatus().deserializeLastReconciledSpec());
|
100 | 113 | }
|
| 114 | + |
| 115 | + @ParameterizedTest |
| 116 | + @EnumSource(UpgradeMode.class) |
| 117 | + public void verifyFailedApplicationRecoveryWithCheckpoint(UpgradeMode upgradeMode) |
| 118 | + throws Exception { |
| 119 | + FlinkDeployment appCluster = TestUtils.buildApplicationCluster(); |
| 120 | + appCluster.getSpec().getJob().setUpgradeMode(upgradeMode); |
| 121 | + |
| 122 | + // Start a healthy deployment |
| 123 | + testController.reconcile(appCluster, context); |
| 124 | + testController.reconcile(appCluster, context); |
| 125 | + testController.reconcile(appCluster, context); |
| 126 | + |
| 127 | + // Mark job_id |
| 128 | + String jobId = appCluster.getStatus().getJobStatus().getJobId(); |
| 129 | + assertNotNull(jobId); |
| 130 | + assertEquals( |
| 131 | + JobManagerDeploymentStatus.READY, |
| 132 | + appCluster.getStatus().getJobManagerDeploymentStatus()); |
| 133 | + assertEquals("RUNNING", appCluster.getStatus().getJobStatus().getState()); |
| 134 | + assertNull(flinkService.getSubmittedConf().get(SavepointConfigOptions.SAVEPOINT_PATH)); |
| 135 | + |
| 136 | + // trigger checkpoint |
| 137 | + CheckpointInfo checkpointInfo = appCluster.getStatus().getJobStatus().getCheckpointInfo(); |
| 138 | + flinkService.triggerCheckpoint( |
| 139 | + null, |
| 140 | + SnapshotTriggerType.PERIODIC, |
| 141 | + checkpointInfo, |
| 142 | + configManager.getObserveConfig(appCluster)); |
| 143 | + |
| 144 | + // Pending |
| 145 | + observer.observeCheckpointStatus(getResourceContext(appCluster)); |
| 146 | + // Completed |
| 147 | + observer.observeCheckpointStatus(getResourceContext(appCluster)); |
| 148 | + assertFalse(SnapshotUtils.checkpointInProgress(appCluster.getStatus().getJobStatus())); |
| 149 | + assertEquals( |
| 150 | + SnapshotUtils.getLastSnapshotStatus(appCluster, CHECKPOINT), |
| 151 | + SnapshotStatus.SUCCEEDED); |
| 152 | + |
| 153 | + // Make deployment unhealthy |
| 154 | + flinkService.markApplicationJobFailedWithError( |
| 155 | + flinkService.listJobs().get(0).f1.getJobId(), "Failed job"); |
| 156 | + testController.reconcile(appCluster, context); |
| 157 | + assertEquals( |
| 158 | + JobManagerDeploymentStatus.DEPLOYING, |
| 159 | + appCluster.getStatus().getJobManagerDeploymentStatus()); |
| 160 | + |
| 161 | + // After restart the deployment is healthy again |
| 162 | + testController.reconcile(appCluster, context); |
| 163 | + testController.reconcile(appCluster, context); |
| 164 | + assertEquals( |
| 165 | + JobManagerDeploymentStatus.READY, |
| 166 | + appCluster.getStatus().getJobManagerDeploymentStatus()); |
| 167 | + assertEquals("RUNNING", appCluster.getStatus().getJobStatus().getState()); |
| 168 | + |
| 169 | + // check savepoint_path |
| 170 | + if (upgradeMode != UpgradeMode.STATELESS) { |
| 171 | + assertEquals( |
| 172 | + flinkService.getSubmittedConf().get(SavepointConfigOptions.SAVEPOINT_PATH), |
| 173 | + "ck_0"); |
| 174 | + } else { |
| 175 | + assertNull(flinkService.getSubmittedConf().get(SavepointConfigOptions.SAVEPOINT_PATH)); |
| 176 | + } |
| 177 | + } |
101 | 178 | }
|
0 commit comments