|
41 | 41 | import org.dinky.data.model.ClusterInstance; |
42 | 42 | import org.dinky.data.model.SystemConfiguration; |
43 | 43 | import org.dinky.data.model.ext.JobInfoDetail; |
| 44 | +import org.dinky.data.model.job.History; |
44 | 45 | import org.dinky.data.model.job.JobInstance; |
45 | 46 | import org.dinky.gateway.Gateway; |
46 | 47 | import org.dinky.gateway.config.GatewayConfig; |
|
52 | 53 | import org.dinky.service.HistoryService; |
53 | 54 | import org.dinky.service.JobHistoryService; |
54 | 55 | import org.dinky.service.JobInstanceService; |
| 56 | +import org.dinky.service.TaskService; |
55 | 57 | import org.dinky.utils.JsonUtils; |
56 | 58 | import org.dinky.utils.TimeUtil; |
57 | 59 |
|
@@ -83,12 +85,14 @@ public class JobRefreshHandler { |
83 | 85 | private static final JobHistoryService jobHistoryService; |
84 | 86 | private static final ClusterInstanceService clusterInstanceService; |
85 | 87 | private static final HistoryService historyService; |
| 88 | + private static final TaskService taskService; |
86 | 89 |
|
87 | 90 | static { |
88 | 91 | jobInstanceService = SpringContextUtils.getBean("jobInstanceServiceImpl", JobInstanceService.class); |
89 | 92 | jobHistoryService = SpringContextUtils.getBean("jobHistoryServiceImpl", JobHistoryService.class); |
90 | 93 | clusterInstanceService = SpringContextUtils.getBean("clusterInstanceServiceImpl", ClusterInstanceService.class); |
91 | 94 | historyService = SpringContextUtils.getBean("historyServiceImpl", HistoryService.class); |
| 95 | + taskService = SpringContextUtils.getBean("taskServiceImpl", TaskService.class); |
92 | 96 | } |
93 | 97 |
|
94 | 98 | /** |
@@ -170,7 +174,9 @@ public static boolean refreshJob(JobInfoDetail jobInfoDetail, boolean needSave) |
170 | 174 |
|
171 | 175 | boolean isTransition = false; |
172 | 176 |
|
173 | | - if (JobStatus.isTransition(jobInstance.getStatus())) { |
| 177 | + if (JobStatus.isTransition( |
| 178 | + jobInstance.getStatus(), |
| 179 | + Asserts.isNull(jobDataDto.getJob()) ? null : jobDataDto.getJob().getEndTime())) { |
174 | 180 | Long finishTime = TimeUtil.localDateTimeToLong(jobInstance.getFinishTime()); |
175 | 181 | long duration = Duration.between(jobInstance.getFinishTime(), LocalDateTime.now()) |
176 | 182 | .toMinutes(); |
@@ -217,6 +223,10 @@ public static boolean refreshJob(JobInfoDetail jobInfoDetail, boolean needSave) |
217 | 223 | if (isDone) { |
218 | 224 | try { |
219 | 225 | log.debug("Job is done: {}->{}", jobInstance.getId(), jobInstance.getName()); |
| 226 | + // 检查是否需要自动重启 |
| 227 | + if (shouldAutoRestart(jobInstance, jobInfoDetail)) { |
| 228 | + tryAutoRestart(jobInstance, jobInfoDetail); |
| 229 | + } |
220 | 230 | handleJobDone(jobInfoDetail); |
221 | 231 | } catch (Exception e) { |
222 | 232 | log.error("failed handel job done:", e); |
@@ -346,6 +356,94 @@ private static void handleJobDone(JobInfoDetail jobInfoDetail) { |
346 | 356 | } |
347 | 357 | } |
348 | 358 |
|
| 359 | + /** |
| 360 | + * Check if the job should be auto-restarted. |
| 361 | + * |
| 362 | + * @param jobInstance The job instance. |
| 363 | + * @param jobInfoDetail The job info detail. |
| 364 | + * @return True if the job should be auto-restarted, false otherwise. |
| 365 | + */ |
| 366 | + private static boolean shouldAutoRestart(JobInstance jobInstance, JobInfoDetail jobInfoDetail) { |
| 367 | + String status = jobInstance.getStatus(); |
| 368 | + // 只对FAILED和UNKNOWN状态进行自动重启 |
| 369 | + if (!JobStatus.FAILED.getValue().equals(status) |
| 370 | + && !JobStatus.UNKNOWN.getValue().equals(status)) { |
| 371 | + return false; |
| 372 | + } |
| 373 | + |
| 374 | + // 检查任务配置中是否启用了自动重启 |
| 375 | + try { |
| 376 | + History history = jobInfoDetail.getHistory(); |
| 377 | + if (Asserts.isNull(history) || Asserts.isNull(history.getConfigJson())) { |
| 378 | + return false; |
| 379 | + } |
| 380 | + |
| 381 | + JobConfig jobConfig = history.getConfigJson(); |
| 382 | + Boolean autoRestart = jobConfig.getAutoRestart(); |
| 383 | + return Boolean.TRUE.equals(autoRestart); |
| 384 | + } catch (Exception e) { |
| 385 | + log.warn("Failed to check auto restart config for job {}: {}", jobInstance.getId(), e.getMessage()); |
| 386 | + return false; |
| 387 | + } |
| 388 | + } |
| 389 | + |
| 390 | + /** |
| 391 | + * Try to auto-restart the job from the latest checkpoint. |
| 392 | + * |
| 393 | + * @param jobInstance The job instance. |
| 394 | + * @param jobInfoDetail The job info detail. |
| 395 | + */ |
| 396 | + private static void tryAutoRestart(JobInstance jobInstance, JobInfoDetail jobInfoDetail) { |
| 397 | + if (Asserts.isNull(jobInstance.getTaskId())) { |
| 398 | + log.warn("Cannot auto restart job {}: taskId is null", jobInstance.getId()); |
| 399 | + return; |
| 400 | + } |
| 401 | + |
| 402 | + try { |
| 403 | + // 获取最新的checkpoint路径 |
| 404 | + String checkpointPath = getLatestCheckpointPath(jobInfoDetail.getJobDataDto()); |
| 405 | + log.info("Auto restarting job {} from checkpoint: {}", jobInstance.getId(), checkpointPath); |
| 406 | + taskService.restartTask(jobInstance.getTaskId(), checkpointPath); |
| 407 | + log.info("Auto restart job {} triggered successfully", jobInstance.getId()); |
| 408 | + } catch (Exception e) { |
| 409 | + log.error("Failed to auto restart job {}: {}", jobInstance.getId(), e.getMessage(), e); |
| 410 | + } |
| 411 | + } |
| 412 | + |
| 413 | + /** |
| 414 | + * Get the latest checkpoint path from JobDataDto. |
| 415 | + * |
| 416 | + * @param jobDataDto The job data DTO. |
| 417 | + * @return The latest checkpoint path, or null if not found. |
| 418 | + */ |
| 419 | + private static String getLatestCheckpointPath(JobDataDto jobDataDto) { |
| 420 | + if (Asserts.isNull(jobDataDto) || Asserts.isNull(jobDataDto.getCheckpoints())) { |
| 421 | + return null; |
| 422 | + } |
| 423 | + |
| 424 | + CheckPointOverView checkpoints = jobDataDto.getCheckpoints(); |
| 425 | + CheckPointOverView.LatestCheckpoints latestCheckpoints = checkpoints.getLatestCheckpoints(); |
| 426 | + if (Asserts.isNull(latestCheckpoints)) { |
| 427 | + return null; |
| 428 | + } |
| 429 | + |
| 430 | + // 优先使用completed checkpoint |
| 431 | + CheckPointOverView.CompletedCheckpointStatistics completedCheckpoint = |
| 432 | + latestCheckpoints.getCompletedCheckpointStatistics(); |
| 433 | + if (Asserts.isNotNull(completedCheckpoint) && Asserts.isNotNullString(completedCheckpoint.getExternalPath())) { |
| 434 | + return completedCheckpoint.getExternalPath(); |
| 435 | + } |
| 436 | + |
| 437 | + // 如果没有completed checkpoint,尝试使用savepoint |
| 438 | + CheckPointOverView.CompletedCheckpointStatistics savepointStatistics = |
| 439 | + latestCheckpoints.getSavepointStatistics(); |
| 440 | + if (Asserts.isNotNull(savepointStatistics) && Asserts.isNotNullString(savepointStatistics.getExternalPath())) { |
| 441 | + return savepointStatistics.getExternalPath(); |
| 442 | + } |
| 443 | + |
| 444 | + return null; |
| 445 | + } |
| 446 | + |
349 | 447 | /** |
350 | 448 | * In a YARN cluster with HA mode enabled, |
351 | 449 | * if the jobManagerHost cannot be connected, |
|
0 commit comments