-
Notifications
You must be signed in to change notification settings - Fork 501
[FLINK-37730][Job Manager] Expose JM exception as K8s exceptions #978
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 13 commits
Commits
Show all changes
15 commits
Select commit
Hold shift + click to select a range
2ee8c55
[FLINK-37730][Job Manager] Expose JM exception as K8s exceptions
vsantwana 22b84b9
[FLINK-37730][dependency] Removes unintended dependency
vsantwana 7d011c3
[FLINK-37730][Observer] Introduces observer and related configuration
vsantwana c434722
[FLINK-37730][Cache] Add cache to store last recorded exception time
vsantwana 3c3a1b5
[FLINK-37730] Moves exception emitter to JobStatusObserver
vsantwana 6fdbf84
[FLINK-37730] Remove unintended changes
vsantwana 61ad2cd
[FLINK-37730][Review] Address comments
vsantwana f1e9320
Addresses review comments
vsantwana 4722465
Adds check for exceptions when prevState is terminal
vsantwana 5104e13
[FLINK-37730] Reverse the exception order to surface newer exceptions first
vsantwana 8f1cab7
[FLINK-37730] Fixes the test failures
vsantwana 1b5e654
Adds generated docs
vsantwana 7414fc4
Changes jobManagerDeployment to job
vsantwana 8c45e14
[FLINK-37730][Exception] Beautify Exception reporting in events
vsantwana 0f64ad4
Updates test to match new code
vsantwana File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,11 +28,21 @@ | |
| import org.apache.flink.kubernetes.operator.reconciler.ReconciliationUtils; | ||
| import org.apache.flink.kubernetes.operator.utils.EventRecorder; | ||
| import org.apache.flink.kubernetes.operator.utils.ExceptionUtils; | ||
| import org.apache.flink.kubernetes.operator.utils.K8sAnnotationsSanitizer; | ||
| import org.apache.flink.runtime.client.JobStatusMessage; | ||
| import org.apache.flink.runtime.rest.messages.JobExceptionsInfoWithHistory; | ||
|
|
||
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
|
|
||
| import java.time.Instant; | ||
| import java.time.ZoneId; | ||
| import java.util.ArrayList; | ||
| import java.util.Comparator; | ||
| import java.util.HashMap; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.Objects; | ||
| import java.util.concurrent.TimeoutException; | ||
|
|
||
| import static org.apache.flink.kubernetes.operator.utils.FlinkResourceExceptionUtils.updateFlinkResourceException; | ||
|
|
@@ -69,17 +79,20 @@ public boolean observe(FlinkResourceContext<R> ctx) { | |
| var jobStatus = resource.getStatus().getJobStatus(); | ||
| LOG.debug("Observing job status"); | ||
| var previousJobStatus = jobStatus.getState(); | ||
|
|
||
| var jobId = jobStatus.getJobId(); | ||
| try { | ||
| var newJobStatusOpt = | ||
| ctx.getFlinkService() | ||
| .getJobStatus( | ||
| ctx.getObserveConfig(), | ||
| JobID.fromHexString(jobStatus.getJobId())); | ||
| .getJobStatus(ctx.getObserveConfig(), JobID.fromHexString(jobId)); | ||
|
|
||
| if (newJobStatusOpt.isPresent()) { | ||
| updateJobStatus(ctx, newJobStatusOpt.get()); | ||
| var newJobStatus = newJobStatusOpt.get(); | ||
| updateJobStatus(ctx, newJobStatus); | ||
| ReconciliationUtils.checkAndUpdateStableSpec(resource.getStatus()); | ||
| // see if the JM server is up, try to get the exceptions | ||
| if (!previousJobStatus.isGloballyTerminalState()) { | ||
| observeJobManagerExceptions(ctx); | ||
| } | ||
| return true; | ||
| } else { | ||
| onTargetJobNotFound(ctx); | ||
|
|
@@ -95,6 +108,155 @@ public boolean observe(FlinkResourceContext<R> ctx) { | |
| return false; | ||
| } | ||
|
|
||
| /** | ||
| * Observe the exceptions raised in the job manager and take appropriate action. | ||
| * | ||
| * @param ctx the context with which the operation is executed | ||
| */ | ||
| protected void observeJobManagerExceptions(FlinkResourceContext<R> ctx) { | ||
| var resource = ctx.getResource(); | ||
| var operatorConfig = ctx.getOperatorConfig(); | ||
| var jobStatus = resource.getStatus().getJobStatus(); | ||
|
|
||
| try { | ||
| var jobId = JobID.fromHexString(jobStatus.getJobId()); | ||
| // TODO: Ideally the best way to restrict the number of events is to use the query param | ||
| // `maxExceptions` | ||
| // but the JobExceptionsMessageParameters does not expose the parameters and nor does | ||
| // it have setters. | ||
| var history = | ||
| ctx.getFlinkService().getJobExceptions(resource, jobId, ctx.getObserveConfig()); | ||
|
|
||
| if (history == null || history.getExceptionHistory() == null) { | ||
| return; | ||
| } | ||
|
|
||
| var exceptionHistory = history.getExceptionHistory(); | ||
| List<JobExceptionsInfoWithHistory.RootExceptionInfo> exceptions = | ||
| exceptionHistory.getEntries(); | ||
| if (exceptions == null || exceptions.isEmpty()) { | ||
| return; | ||
| } | ||
|
|
||
| if (exceptionHistory.isTruncated()) { | ||
| LOG.warn( | ||
| "Job exception history is truncated for jobId '{}'. Some exceptions may be missing.", | ||
| jobId); | ||
| } | ||
|
|
||
| String currentJobId = jobStatus.getJobId(); | ||
| Instant lastRecorded = null; // first reconciliation | ||
|
|
||
| var cacheEntry = ctx.getExceptionCacheEntry(); | ||
| // a cache entry is created should always be present. The timestamp for the first | ||
| // reconciliation would be | ||
| // when the job was created. This check is still necessary because even though there | ||
| // might be an entry, | ||
| // the jobId could have changed since the job was first created. | ||
| if (cacheEntry.getJobId() != null && cacheEntry.getJobId().equals(currentJobId)) { | ||
| lastRecorded = Instant.ofEpochMilli(cacheEntry.getLastTimestamp()); | ||
| } | ||
|
|
||
| int maxEvents = operatorConfig.getReportedExceptionEventsMaxCount(); | ||
| int maxStackTraceLines = operatorConfig.getReportedExceptionEventsMaxStackTraceLength(); | ||
|
|
||
| // Sort and reverse to prioritize the newest exceptions | ||
| var sortedExceptions = new ArrayList<>(exceptions); | ||
| sortedExceptions.sort( | ||
| Comparator.comparingLong( | ||
| JobExceptionsInfoWithHistory.RootExceptionInfo::getTimestamp) | ||
| .reversed()); | ||
| int count = 0; | ||
| Instant latestSeen = null; | ||
|
|
||
| for (var exception : sortedExceptions) { | ||
| Instant exceptionTime = Instant.ofEpochMilli(exception.getTimestamp()); | ||
| // Skip already recorded exceptions | ||
| if (lastRecorded != null && !exceptionTime.isAfter(lastRecorded)) { | ||
| break; | ||
| } | ||
| emitJobManagerExceptionEvent(ctx, exception, exceptionTime, maxStackTraceLines); | ||
| if (latestSeen == null || exceptionTime.isAfter(latestSeen)) { | ||
| latestSeen = exceptionTime; | ||
| } | ||
| if (++count >= maxEvents) { | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| ctx.getExceptionCacheEntry().setJobId(currentJobId); | ||
| // Set to the timestamp of the latest emitted exception, if any were emitted | ||
| // the other option is that if no exceptions were emitted, we set this to now. | ||
| if (latestSeen != null) { | ||
| ctx.getExceptionCacheEntry().setLastTimestamp(latestSeen.toEpochMilli()); | ||
| } | ||
|
|
||
| } catch (Exception e) { | ||
| LOG.warn("Failed to fetch JobManager exception info.", e); | ||
| } | ||
| } | ||
|
|
||
| private void emitJobManagerExceptionEvent( | ||
| FlinkResourceContext<R> ctx, | ||
| JobExceptionsInfoWithHistory.RootExceptionInfo exception, | ||
| Instant exceptionTime, | ||
| int maxStackTraceLines) { | ||
|
|
||
| String exceptionName = exception.getExceptionName(); | ||
| if (exceptionName == null || exceptionName.isBlank()) { | ||
| return; | ||
| } | ||
| Map<String, String> annotations = new HashMap<>(); | ||
| if (exceptionTime != null) { | ||
| annotations.put( | ||
| "exception-timestamp", | ||
| exceptionTime.atZone(ZoneId.systemDefault()).toOffsetDateTime().toString()); | ||
| } | ||
| if (exception.getTaskName() != null) { | ||
| annotations.put("task-name", exception.getTaskName()); | ||
| } | ||
| if (exception.getEndpoint() != null) { | ||
| annotations.put("endpoint", exception.getEndpoint()); | ||
| } | ||
| if (exception.getTaskManagerId() != null) { | ||
| annotations.put("tm-id", exception.getTaskManagerId()); | ||
| } | ||
| if (exception.getFailureLabels() != null) { | ||
| exception | ||
| .getFailureLabels() | ||
| .forEach((k, v) -> annotations.put("failure-label-" + k, v)); | ||
| } | ||
|
|
||
| StringBuilder eventMessage = new StringBuilder(exceptionName); | ||
| String stacktrace = exception.getStacktrace(); | ||
| if (stacktrace != null && !stacktrace.isBlank()) { | ||
| String[] lines = stacktrace.split("\n"); | ||
| eventMessage.append("\n\nStacktrace (truncated):\n"); | ||
|
||
| for (int i = 0; i < Math.min(maxStackTraceLines, lines.length); i++) { | ||
| eventMessage.append(lines[i]).append("\n"); | ||
| } | ||
| if (lines.length > maxStackTraceLines) { | ||
| eventMessage | ||
| .append("... (") | ||
| .append(lines.length - maxStackTraceLines) | ||
| .append(" more lines)"); | ||
| } | ||
| } | ||
|
|
||
| String identityKey = | ||
| "jobmanager-exception-" | ||
| + Integer.toHexString(Objects.hash(eventMessage.toString())); | ||
| eventRecorder.triggerEventWithAnnotations( | ||
| ctx.getResource(), | ||
| EventRecorder.Type.Warning, | ||
| EventRecorder.Reason.JobException, | ||
| eventMessage.toString().trim(), | ||
| EventRecorder.Component.Job, | ||
| identityKey, | ||
| ctx.getKubernetesClient(), | ||
| K8sAnnotationsSanitizer.sanitizeAnnotations(annotations)); | ||
| } | ||
|
|
||
| /** | ||
| * Callback when no matching target job was found on a cluster where jobs were found. | ||
| * | ||
|
|
||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.

Uh oh!
There was an error while loading. Please reload this page.