|
20 | 20 | import org.apache.flink.api.common.JobID; |
21 | 21 | import org.apache.flink.api.common.JobStatus; |
22 | 22 | import org.apache.flink.autoscaler.JobAutoScaler; |
| 23 | +import org.apache.flink.autoscaler.utils.DateTimeUtils; |
23 | 24 | import org.apache.flink.configuration.Configuration; |
24 | 25 | import org.apache.flink.configuration.HighAvailabilityOptions; |
25 | 26 | import org.apache.flink.configuration.PipelineOptionsInternal; |
|
45 | 46 | import org.apache.flink.runtime.highavailability.JobResultStoreOptions; |
46 | 47 | import org.apache.flink.runtime.jobgraph.SavepointConfigOptions; |
47 | 48 | import org.apache.flink.runtime.jobmanager.HighAvailabilityMode; |
| 49 | +import org.apache.flink.runtime.rest.messages.JobExceptionsInfoWithHistory; |
48 | 50 | import org.apache.flink.util.Preconditions; |
49 | 51 |
|
50 | 52 | import io.fabric8.kubernetes.client.KubernetesClient; |
|
55 | 57 | import org.slf4j.LoggerFactory; |
56 | 58 |
|
57 | 59 | import java.time.Instant; |
| 60 | +import java.time.ZoneId; |
| 61 | +import java.util.Map; |
58 | 62 | import java.util.Optional; |
59 | 63 | import java.util.UUID; |
60 | 64 |
|
@@ -299,9 +303,92 @@ public boolean reconcileOtherChanges(FlinkResourceContext<FlinkDeployment> ctx) |
299 | 303 | return true; |
300 | 304 | } |
301 | 305 |
|
| 306 | + // check for JobManager exceptions if the REST API server is still up. |
| 307 | + if (!ReconciliationUtils.isJobInTerminalState(deployment.getStatus())) { |
| 308 | + observeJobManagerExceptions(ctx, deployment, observeConfig); |
| 309 | + } |
| 310 | + |
302 | 311 | return cleanupTerminalJmAfterTtl(ctx.getFlinkService(), deployment, observeConfig); |
303 | 312 | } |
304 | 313 |
|
| 314 | + private void observeJobManagerExceptions( |
| 315 | + FlinkResourceContext<FlinkDeployment> ctx, |
| 316 | + FlinkDeployment deployment, |
| 317 | + Configuration observeConfig) { |
| 318 | + try { |
| 319 | + var jobId = JobID.fromHexString(deployment.getStatus().getJobStatus().getJobId()); |
| 320 | + var history = ctx.getFlinkService().getJobExceptions(deployment, jobId, observeConfig); |
| 321 | + if (history == null || history.getExceptionHistory() == null) { |
| 322 | + return; |
| 323 | + } |
| 324 | + var exceptionHistory = history.getExceptionHistory(); |
| 325 | + var exceptions = exceptionHistory.getEntries(); |
| 326 | + if (exceptions.isEmpty()) { |
| 327 | + LOG.info(String.format("No exceptions found in job exception history for jobId '%s'.", jobId)); |
| 328 | + return; |
| 329 | + } |
| 330 | + if (exceptionHistory.isTruncated()) { |
| 331 | + LOG.warn(String.format("Job exception history is truncated for jobId '%s'. " |
| 332 | + + "Some exceptions are not shown.", jobId)); |
| 333 | + } |
| 334 | + for (var exception : exceptions) { |
| 335 | + emitJobManagerExceptionEvent(ctx, deployment, exception); |
| 336 | + } |
| 337 | + } catch (Exception e) { |
| 338 | + LOG.warn("Could not fetch JobManager exception info.", e); |
| 339 | + } |
| 340 | + } |
| 341 | + |
| 342 | + private void emitJobManagerExceptionEvent( |
| 343 | + FlinkResourceContext<FlinkDeployment> ctx, |
| 344 | + FlinkDeployment deployment, |
| 345 | + JobExceptionsInfoWithHistory.RootExceptionInfo exception) { |
| 346 | + |
| 347 | + String message = exception.getExceptionName(); |
| 348 | + if (message == null || message.isBlank()) { |
| 349 | + return; |
| 350 | + } |
| 351 | + |
| 352 | + String stacktrace = exception.getStacktrace(); |
| 353 | + String taskName = exception.getTaskName(); |
| 354 | + String endpoint = exception.getEndpoint(); |
| 355 | + String tmId = exception.getTaskManagerId(); |
| 356 | + Map<String, String> labels = exception.getFailureLabels(); |
| 357 | + String time = DateTimeUtils.readable(Instant.ofEpochMilli(exception.getTimestamp()), ZoneId.systemDefault()); |
| 358 | + |
| 359 | + StringBuilder combined = new StringBuilder(); |
| 360 | + combined.append("JobManager Exception at ").append(time).append(":\n"); |
| 361 | + combined.append(message).append("\n\n"); |
| 362 | + |
| 363 | + if (taskName != null) { |
| 364 | + combined.append("Task: ").append(taskName).append("\n"); |
| 365 | + } |
| 366 | + if (endpoint != null) { |
| 367 | + combined.append("Endpoint: ").append(endpoint).append("\n"); |
| 368 | + } |
| 369 | + if (tmId != null) { |
| 370 | + combined.append("TaskManager ID: ").append(tmId).append("\n"); |
| 371 | + } |
| 372 | + |
| 373 | + if (labels != null && !labels.isEmpty()) { |
| 374 | + combined.append("Failure Labels:\n"); |
| 375 | + labels.forEach((k, v) -> combined.append("- ").append(k).append(": ").append(v).append("\n")); |
| 376 | + } |
| 377 | + |
| 378 | + if (stacktrace != null && !stacktrace.isBlank()) { |
| 379 | + combined.append("\nStacktrace:\n").append(stacktrace); |
| 380 | + } |
| 381 | + |
| 382 | + eventRecorder.triggerEventOnce( |
| 383 | + deployment, |
| 384 | + EventRecorder.Type.Warning, |
| 385 | + EventRecorder.Reason.JobManagerException, |
| 386 | + combined.toString(), |
| 387 | + EventRecorder.Component.JobManagerDeployment, |
| 388 | + "jobmanager-exception-" + message.hashCode(), |
| 389 | + ctx.getKubernetesClient()); |
| 390 | + } |
| 391 | + |
305 | 392 | private boolean shouldRestartJobBecauseUnhealthy( |
306 | 393 | FlinkDeployment deployment, Configuration observeConfig) { |
307 | 394 | boolean restartNeeded = false; |
|
0 commit comments