1919
2020import org .apache .flink .api .common .JobID ;
2121import org .apache .flink .api .common .JobStatus ;
22- import org .apache .flink .autoscaler .utils .DateTimeUtils ;
2322import org .apache .flink .kubernetes .operator .api .AbstractFlinkResource ;
2423import org .apache .flink .kubernetes .operator .api .FlinkSessionJob ;
2524import org .apache .flink .kubernetes .operator .api .spec .JobState ;
4342import java .util .HashMap ;
4443import java .util .List ;
4544import java .util .Map ;
45+ import java .util .Objects ;
4646import java .util .concurrent .TimeoutException ;
4747
4848import static org .apache .flink .kubernetes .operator .utils .FlinkResourceExceptionUtils .updateFlinkResourceException ;
@@ -153,11 +153,10 @@ protected void observeJobManagerExceptions(FlinkResourceContext<R> ctx) {
153153 // when the job was created. This check is still necessary because even though there
154154 // might be an entry,
155155 // the jobId could have changed since the job was first created.
156- if (cacheEntry .getJobId ().equals (currentJobId )) {
156+ if (cacheEntry .getJobId () != null && cacheEntry . getJobId () .equals (currentJobId )) {
157157 lastRecorded = Instant .ofEpochMilli (cacheEntry .getLastTimestamp ());
158158 }
159159
160- Instant now = Instant .now ();
161160 int maxEvents = operatorConfig .getReportedExceptionEventsMaxCount ();
162161 int maxStackTraceLines = operatorConfig .getReportedExceptionEventsMaxStackTraceLength ();
163162
@@ -167,21 +166,31 @@ protected void observeJobManagerExceptions(FlinkResourceContext<R> ctx) {
167166 Comparator .comparingLong (
168167 JobExceptionsInfoWithHistory .RootExceptionInfo ::getTimestamp )
169168 .reversed ());
170-
171169 int count = 0 ;
170+ Instant latestSeen = null ;
171+
172172 for (var exception : sortedExceptions ) {
173173 Instant exceptionTime = Instant .ofEpochMilli (exception .getTimestamp ());
174174 // Skip already recorded exceptions
175- if (lastRecorded != null && exceptionTime .isBefore (lastRecorded )) {
176- continue ;
175+ if (lastRecorded != null && ! exceptionTime .isAfter (lastRecorded )) {
176+ break ;
177177 }
178178 emitJobManagerExceptionEvent (ctx , exception , exceptionTime , maxStackTraceLines );
179+ if (latestSeen == null || exceptionTime .isAfter (latestSeen )) {
180+ latestSeen = exceptionTime ;
181+ }
179182 if (++count >= maxEvents ) {
180183 break ;
181184 }
182185 }
186+
183187 ctx .getExceptionCacheEntry ().setJobId (currentJobId );
184- ctx .getExceptionCacheEntry ().setLastTimestamp (now .toEpochMilli ());
188+ // Set to the timestamp of the latest emitted exception, if any were emitted
189+ // the other option is that if no exceptions were emitted, we set this to now.
190+ if (latestSeen != null ) {
191+ ctx .getExceptionCacheEntry ().setLastTimestamp (latestSeen .toEpochMilli ());
192+ }
193+
185194 } catch (Exception e ) {
186195 LOG .warn ("Failed to fetch JobManager exception info." , e );
187196 }
@@ -197,13 +206,12 @@ private void emitJobManagerExceptionEvent(
197206 if (exceptionName == null || exceptionName .isBlank ()) {
198207 return ;
199208 }
200-
201209 Map <String , String > annotations = new HashMap <>();
202- annotations . put (
203- "event-time-readable" ,
204- DateTimeUtils . readable ( exceptionTime , ZoneId . systemDefault ()));
205- annotations . put ( "event-timestamp-millis" , String . valueOf ( exceptionTime . toEpochMilli () ));
206-
210+ if ( exceptionTime != null ) {
211+ annotations . put (
212+ "exception-timestamp" ,
213+ exceptionTime . atZone ( ZoneId . systemDefault ()). toOffsetDateTime (). toString ( ));
214+ }
207215 if (exception .getTaskName () != null ) {
208216 annotations .put ("task-name" , exception .getTaskName ());
209217 }
@@ -213,7 +221,6 @@ private void emitJobManagerExceptionEvent(
213221 if (exception .getTaskManagerId () != null ) {
214222 annotations .put ("tm-id" , exception .getTaskManagerId ());
215223 }
216-
217224 if (exception .getFailureLabels () != null ) {
218225 exception
219226 .getFailureLabels ()
@@ -236,16 +243,16 @@ private void emitJobManagerExceptionEvent(
236243 }
237244 }
238245
239- String keyMessage =
240- exceptionName . length () > 128 ? exceptionName . substring ( 0 , 128 ) : exceptionName ;
241-
242- eventRecorder .triggerEventOnceWithAnnotations (
246+ String identityKey =
247+ "jobmanager-exception-"
248+ + Integer . toHexString ( Objects . hash ( eventMessage . toString ()));
249+ eventRecorder .triggerEventWithAnnotations (
243250 ctx .getResource (),
244251 EventRecorder .Type .Warning ,
245252 EventRecorder .Reason .JobException ,
246253 eventMessage .toString ().trim (),
247254 EventRecorder .Component .JobManagerDeployment ,
248- "jobmanager-exception-" + keyMessage . hashCode () ,
255+ identityKey ,
249256 ctx .getKubernetesClient (),
250257 K8sAnnotationsSanitizer .sanitizeAnnotations (annotations ));
251258 }
0 commit comments