Skip to content

Commit 3c60c3c

Browse files
authored
[FLINK-37730] Improve exception recording ts initialization + 2.0 compatibility
1 parent 0679d63 commit 3c60c3c

File tree

8 files changed

+568
-75
lines changed

8 files changed

+568
-75
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,5 @@ buildNumber.properties
3636
.idea
3737
*.iml
3838
*.DS_Store
39+
40+
.kube

flink-kubernetes-operator/src/main/java/org/apache/flink/kubernetes/operator/observer/JobStatusObserver.java

Lines changed: 59 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import org.apache.flink.kubernetes.operator.controller.FlinkResourceContext;
2828
import org.apache.flink.kubernetes.operator.reconciler.ReconciliationUtils;
2929
import org.apache.flink.kubernetes.operator.utils.EventRecorder;
30+
import org.apache.flink.kubernetes.operator.utils.EventUtils;
3031
import org.apache.flink.kubernetes.operator.utils.ExceptionUtils;
3132
import org.apache.flink.kubernetes.operator.utils.K8sAnnotationsSanitizer;
3233
import org.apache.flink.runtime.client.JobStatusMessage;
@@ -35,12 +36,12 @@
3536
import org.slf4j.Logger;
3637
import org.slf4j.LoggerFactory;
3738

39+
import java.time.Duration;
3840
import java.time.Instant;
39-
import java.time.ZoneId;
4041
import java.util.ArrayList;
42+
import java.util.Collections;
4143
import java.util.Comparator;
4244
import java.util.HashMap;
43-
import java.util.List;
4445
import java.util.Map;
4546
import java.util.Objects;
4647
import java.util.concurrent.TimeoutException;
@@ -53,6 +54,8 @@ public class JobStatusObserver<R extends AbstractFlinkResource<?, ?>> {
5354
private static final Logger LOG = LoggerFactory.getLogger(JobStatusObserver.class);
5455

5556
public static final String JOB_NOT_FOUND_ERR = "Job Not Found";
57+
public static final String EXCEPTION_TIMESTAMP = "exception-timestamp";
58+
public static final Duration MAX_K8S_EVENT_AGE = Duration.ofMinutes(30);
5659

5760
protected final EventRecorder eventRecorder;
5861

@@ -132,65 +135,77 @@ protected void observeJobManagerExceptions(FlinkResourceContext<R> ctx) {
132135
}
133136

134137
var exceptionHistory = history.getExceptionHistory();
135-
List<JobExceptionsInfoWithHistory.RootExceptionInfo> exceptions =
136-
exceptionHistory.getEntries();
137-
if (exceptions == null || exceptions.isEmpty()) {
138-
return;
138+
var exceptions = exceptionHistory.getEntries();
139+
if (exceptions != null) {
140+
exceptions = new ArrayList<>(exceptions);
141+
exceptions.sort(
142+
Comparator.comparingLong(
143+
JobExceptionsInfoWithHistory.RootExceptionInfo
144+
::getTimestamp)
145+
.reversed());
146+
} else {
147+
exceptions = Collections.emptyList();
139148
}
140149

141-
if (exceptionHistory.isTruncated()) {
142-
LOG.warn(
143-
"Job exception history is truncated for jobId '{}'. Some exceptions may be missing.",
144-
jobId);
150+
String currentJobId = jobStatus.getJobId();
151+
var cacheEntry = ctx.getExceptionCacheEntry();
152+
153+
if (!cacheEntry.isInitialized()) {
154+
Instant lastExceptionTs;
155+
if (exceptions.isEmpty()) {
156+
// If the job doesn't have any exceptions set to MIN as we always have to record
157+
// the next
158+
lastExceptionTs = Instant.MIN;
159+
} else {
160+
var k8sExpirationTs = Instant.now().minus(MAX_K8S_EVENT_AGE);
161+
var maxJobExceptionTs = Instant.ofEpochMilli(exceptions.get(0).getTimestamp());
162+
if (maxJobExceptionTs.isBefore(k8sExpirationTs)) {
163+
// If the last job exception was a long time ago, then there is no point in
164+
// checking in k8s. We won't report this as exception
165+
lastExceptionTs = maxJobExceptionTs;
166+
} else {
167+
// If there were recent exceptions, we check the triggered events from kube
168+
// to make sure we don't double trigger
169+
lastExceptionTs =
170+
EventUtils.findLastJobExceptionTsFromK8s(
171+
ctx.getKubernetesClient(), resource)
172+
.orElse(k8sExpirationTs);
173+
}
174+
}
175+
176+
cacheEntry.setLastTimestamp(lastExceptionTs);
177+
cacheEntry.setInitialized(true);
178+
cacheEntry.setJobId(currentJobId);
145179
}
146180

147-
String currentJobId = jobStatus.getJobId();
148-
Instant lastRecorded = null; // first reconciliation
181+
var lastRecorded =
182+
currentJobId.equals(cacheEntry.getJobId())
183+
? cacheEntry.getLastTimestamp()
184+
: Instant.MIN;
149185

150-
var cacheEntry = ctx.getExceptionCacheEntry();
151-
// a cache entry is created should always be present. The timestamp for the first
152-
// reconciliation would be
153-
// when the job was created. This check is still necessary because even though there
154-
// might be an entry,
155-
// the jobId could have changed since the job was first created.
156-
if (cacheEntry.getJobId() != null && cacheEntry.getJobId().equals(currentJobId)) {
157-
lastRecorded = Instant.ofEpochMilli(cacheEntry.getLastTimestamp());
186+
if (exceptions.isEmpty()) {
187+
return;
158188
}
159189

160190
int maxEvents = operatorConfig.getReportedExceptionEventsMaxCount();
161191
int maxStackTraceLines = operatorConfig.getReportedExceptionEventsMaxStackTraceLength();
162192

163-
// Sort and reverse to prioritize the newest exceptions
164-
var sortedExceptions = new ArrayList<>(exceptions);
165-
sortedExceptions.sort(
166-
Comparator.comparingLong(
167-
JobExceptionsInfoWithHistory.RootExceptionInfo::getTimestamp)
168-
.reversed());
169193
int count = 0;
170-
Instant latestSeen = null;
171-
172-
for (var exception : sortedExceptions) {
173-
Instant exceptionTime = Instant.ofEpochMilli(exception.getTimestamp());
174-
// Skip already recorded exceptions
175-
if (lastRecorded != null && !exceptionTime.isAfter(lastRecorded)) {
194+
for (var exception : exceptions) {
195+
var exceptionTime = Instant.ofEpochMilli(exception.getTimestamp());
196+
// Skip already recorded exceptions and after max count
197+
if (!exceptionTime.isAfter(lastRecorded) || count++ >= maxEvents) {
176198
break;
177199
}
178200
emitJobManagerExceptionEvent(ctx, exception, exceptionTime, maxStackTraceLines);
179-
if (latestSeen == null) {
180-
latestSeen = exceptionTime;
181-
}
182-
if (++count >= maxEvents) {
183-
break;
184-
}
185201
}
186202

187-
ctx.getExceptionCacheEntry().setJobId(currentJobId);
188-
// Set to the timestamp of the latest emitted exception, if any were emitted
189-
// the other option is that if no exceptions were emitted, we set this to now.
190-
if (latestSeen != null) {
191-
ctx.getExceptionCacheEntry().setLastTimestamp(latestSeen.toEpochMilli());
203+
if (count > maxEvents) {
204+
LOG.warn("Job exception history is truncated. Some exceptions may be missing.");
192205
}
193206

207+
cacheEntry.setJobId(currentJobId);
208+
cacheEntry.setLastTimestamp(Instant.ofEpochMilli(exceptions.get(0).getTimestamp()));
194209
} catch (Exception e) {
195210
LOG.warn("Failed to fetch JobManager exception info.", e);
196211
}
@@ -203,9 +218,7 @@ private void emitJobManagerExceptionEvent(
203218
int maxStackTraceLines) {
204219
Map<String, String> annotations = new HashMap<>();
205220
if (exceptionTime != null) {
206-
annotations.put(
207-
"exception-timestamp",
208-
exceptionTime.atZone(ZoneId.systemDefault()).toOffsetDateTime().toString());
221+
annotations.put(EXCEPTION_TIMESTAMP, exceptionTime.toString());
209222
}
210223
if (exception.getTaskName() != null) {
211224
annotations.put("task-name", exception.getTaskName());

flink-kubernetes-operator/src/main/java/org/apache/flink/kubernetes/operator/service/FlinkResourceContextFactory.java

Lines changed: 5 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
import org.slf4j.Logger;
4444
import org.slf4j.LoggerFactory;
4545

46+
import java.time.Instant;
4647
import java.util.Map;
4748
import java.util.concurrent.ConcurrentHashMap;
4849
import java.util.concurrent.ExecutorService;
@@ -57,12 +58,8 @@ public class FlinkResourceContextFactory {
5758
@Data
5859
public static final class ExceptionCacheEntry {
5960
private String jobId;
60-
private long lastTimestamp;
61-
62-
public ExceptionCacheEntry(String jobId, long lastTimestamp) {
63-
this.jobId = jobId;
64-
this.lastTimestamp = lastTimestamp;
65-
}
61+
private Instant lastTimestamp;
62+
private boolean initialized;
6663
}
6764

6865
@VisibleForTesting
@@ -128,10 +125,7 @@ public FlinkStateSnapshotContext getFlinkStateSnapshotContext(
128125
configManager,
129126
this::getFlinkService,
130127
lastRecordedExceptionCache.computeIfAbsent(
131-
resourceId,
132-
id ->
133-
new ExceptionCacheEntry(
134-
flinkDepJobId, System.currentTimeMillis())));
128+
resourceId, id -> new ExceptionCacheEntry()));
135129
} else if (resource instanceof FlinkSessionJob) {
136130
var resourceId = ResourceID.fromResource(resource);
137131
var flinkSessionJobId = jobId;
@@ -143,11 +137,7 @@ public FlinkStateSnapshotContext getFlinkStateSnapshotContext(
143137
configManager,
144138
this::getFlinkService,
145139
lastRecordedExceptionCache.computeIfAbsent(
146-
resourceId,
147-
id ->
148-
new ExceptionCacheEntry(
149-
flinkSessionJobId,
150-
System.currentTimeMillis())));
140+
resourceId, id -> new ExceptionCacheEntry()));
151141
} else {
152142
throw new IllegalArgumentException(
153143
"Unknown resource type " + resource.getClass().getSimpleName());

flink-kubernetes-operator/src/main/java/org/apache/flink/kubernetes/operator/utils/EventUtils.java

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
import org.apache.flink.annotation.VisibleForTesting;
2121
import org.apache.flink.kubernetes.operator.exception.DeploymentFailedException;
22+
import org.apache.flink.kubernetes.operator.observer.JobStatusObserver;
2223

2324
import io.fabric8.kubernetes.api.model.Event;
2425
import io.fabric8.kubernetes.api.model.EventBuilder;
@@ -38,6 +39,7 @@
3839
import java.time.Duration;
3940
import java.time.Instant;
4041
import java.util.ArrayList;
42+
import java.util.Comparator;
4143
import java.util.List;
4244
import java.util.Map;
4345
import java.util.Optional;
@@ -283,13 +285,13 @@ private static Optional<Event> createOrReplaceEvent(KubernetesClient client, Eve
283285
return Optional.empty();
284286
}
285287

286-
private static List<Event> getPodEvents(KubernetesClient client, Pod pod) {
287-
var ref = getObjectReference(pod);
288+
private static List<Event> getResourceEvents(KubernetesClient client, HasMetadata cr) {
289+
var ref = getObjectReference(cr);
288290

289291
var eventList =
290292
client.v1()
291293
.events()
292-
.inNamespace(pod.getMetadata().getNamespace())
294+
.inNamespace(cr.getMetadata().getNamespace())
293295
.withInvolvedObject(ref)
294296
.list();
295297

@@ -343,7 +345,7 @@ public static void checkForVolumeMountErrors(KubernetesClient client, Pod pod) {
343345
boolean notReady = checkStatusWasAlways(pod, conditionMap.get("Ready"), "False");
344346

345347
if (notReady && failedInitialization) {
346-
getPodEvents(client, pod).stream()
348+
getResourceEvents(client, pod).stream()
347349
.filter(e -> e.getReason().equals("FailedMount"))
348350
.findAny()
349351
.ifPresent(
@@ -356,4 +358,20 @@ public static void checkForVolumeMountErrors(KubernetesClient client, Pod pod) {
356358
private static boolean checkStatusWasAlways(Pod pod, PodCondition condition, String status) {
357359
return condition != null && condition.getStatus().equals(status);
358360
}
361+
362+
public static Optional<Instant> findLastJobExceptionTsFromK8s(
363+
KubernetesClient client, HasMetadata cr) {
364+
var events = getResourceEvents(client, cr);
365+
return events.stream()
366+
.filter(e -> EventRecorder.Reason.JobException.name().equals(e.getReason()))
367+
.map(
368+
e ->
369+
Instant.parse(
370+
e.getMetadata()
371+
.getAnnotations()
372+
.getOrDefault(
373+
JobStatusObserver.EXCEPTION_TIMESTAMP,
374+
e.getMetadata().getCreationTimestamp())))
375+
.max(Comparator.naturalOrder());
376+
}
359377
}

0 commit comments

Comments
 (0)