2020import java .net .http .HttpClient ;
2121import java .net .http .HttpRequest ;
2222import java .net .http .HttpResponse ;
23+ import java .nio .charset .StandardCharsets ;
2324import java .time .Duration ;
2425import java .time .Instant ;
2526import java .util .ArrayList ;
2627import java .util .Arrays ;
28+ import java .util .HashMap ;
2729import java .util .List ;
2830import java .util .Map ;
2931import java .util .Optional ;
3032import java .util .Set ;
33+ import java .util .UUID ;
3134import java .util .concurrent .ConcurrentHashMap ;
3235import java .util .concurrent .ConcurrentLinkedQueue ;
3336import java .util .concurrent .CopyOnWriteArrayList ;
@@ -82,6 +85,9 @@ public class PodFailureWatcher {
8285 private final ConcurrentLinkedQueue <String > pendingFailureQueue = new ConcurrentLinkedQueue <>();
8386 private final Set <String > queuedFailureKeys =
8487 java .util .Collections .newSetFromMap (new ConcurrentHashMap <>());
88+ // Pod annotations
89+ private static final String ANNOTATION_PREFIX = "podmortem.io/" ;
90+ private static final String ANALYSIS_ID_ANNOTATION = ANNOTATION_PREFIX + "analysis-id" ;
8591
8692 /**
8793 * Initializes the pod failure watcher on application startup.
@@ -401,6 +407,26 @@ private boolean hasPodFailed(Pod pod) {
401407 private void handlePodFailure (Pod pod ) {
402408 String podKey = pod .getMetadata ().getNamespace () + "/" + pod .getMetadata ().getName ();
403409
410+ // Skip if an analysis-id annotation already exists (already analyzed or in-progress)
411+ try {
412+ Pod latest =
413+ client .pods ()
414+ .inNamespace (pod .getMetadata ().getNamespace ())
415+ .withName (pod .getMetadata ().getName ())
416+ .get ();
417+ if (latest != null
418+ && latest .getMetadata ().getAnnotations () != null
419+ && latest .getMetadata ().getAnnotations ().containsKey (ANALYSIS_ID_ANNOTATION )) {
420+ log .debug (
421+ "Skipping analysis for {}: analysis-id already present: {}" ,
422+ podKey ,
423+ latest .getMetadata ().getAnnotations ().get (ANALYSIS_ID_ANNOTATION ));
424+ return ;
425+ }
426+ } catch (Exception e ) {
427+ log .warn ("Failed to check existing analysis-id for {}: {}" , podKey , e .getMessage ());
428+ }
429+
404430 // find matching Podmortem resources first; if none, ignore silently
405431 List <Podmortem > podmortemResources = findMatchingPodmortemResources (pod );
406432 if (podmortemResources .isEmpty ()) {
@@ -425,12 +451,108 @@ private void handlePodFailure(Pod pod) {
425451 processedFailures .put (podKey , failureTime );
426452 }
427453
454+ // Assign and annotate a deterministic analysis ID before starting analysis to prevent
455+ // duplicates
456+ String analysisId = generateAnalysisId (pod , failureTime );
457+ boolean annotated = annotateAnalysisIdWithRetry (pod , analysisId , 0 , 5 , 100 );
458+ if (!annotated ) {
459+ log .warn (
460+ "Proceeding without analysis-id annotation for {} (id: {})" ,
461+ podKey ,
462+ analysisId );
463+ }
464+
428465 for (Podmortem podmortem : podmortemResources ) {
429466 eventService .emitFailureDetected (pod , podmortem );
430467 processPodFailureForPodmortem (podmortem , pod );
431468 }
432469 }
433470
471+ /**
472+ * Generates a deterministic analysis ID based on pod UID and failure timestamp.
473+ *
474+ * <p>Uses name-based UUID so the same pod failure yields the same ID across restarts. Falls
475+ * back to a random UUID if the failure time is unavailable.
476+ */
477+ private String generateAnalysisId (Pod pod , Instant failureTime ) {
478+ try {
479+ String uid =
480+ pod .getMetadata ().getUid () != null
481+ ? pod .getMetadata ().getUid ()
482+ : pod .getMetadata ().getName ();
483+ String failureComponent = failureTime != null ? failureTime .toString () : "no-time" ;
484+ String seed = uid + ":" + failureComponent ;
485+ return UUID .nameUUIDFromBytes (seed .getBytes (StandardCharsets .UTF_8 )).toString ();
486+ } catch (Exception e ) {
487+ return UUID .randomUUID ().toString ();
488+ }
489+ }
490+
491+ /**
492+ * Adds the analysis-id annotation to the pod with retry and backoff, if not already present.
493+ */
494+ private boolean annotateAnalysisIdWithRetry (
495+ Pod pod , String analysisId , int attempt , int maxRetries , long delayMs ) {
496+ if (attempt > 0 ) {
497+ try {
498+ Thread .sleep (delayMs );
499+ } catch (InterruptedException e ) {
500+ Thread .currentThread ().interrupt ();
501+ return false ;
502+ }
503+ }
504+
505+ try {
506+ Pod latest =
507+ client .pods ()
508+ .inNamespace (pod .getMetadata ().getNamespace ())
509+ .withName (pod .getMetadata ().getName ())
510+ .get ();
511+ if (latest == null ) {
512+ return false ;
513+ }
514+
515+ Map <String , String > annotations = latest .getMetadata ().getAnnotations ();
516+ if (annotations == null ) {
517+ annotations = new HashMap <>();
518+ }
519+
520+ String key = ANALYSIS_ID_ANNOTATION ;
521+ if (annotations .containsKey (key )) {
522+ return true ;
523+ }
524+
525+ annotations .put (key , analysisId );
526+ latest .getMetadata ().setAnnotations (annotations );
527+ client .pods ()
528+ .inNamespace (latest .getMetadata ().getNamespace ())
529+ .withName (latest .getMetadata ().getName ())
530+ .patch (latest );
531+ return true ;
532+
533+ } catch (io .fabric8 .kubernetes .client .KubernetesClientException e ) {
534+ if (e .getCode () == 409 && attempt < maxRetries ) {
535+ return annotateAnalysisIdWithRetry (
536+ pod , analysisId , attempt + 1 , maxRetries , delayMs * 2 );
537+ } else if (e .getCode () == 403 ) {
538+ log .warn (
539+ "Forbidden to set analysis-id annotation for {} - check RBAC permissions: {}" ,
540+ pod .getMetadata ().getName (),
541+ e .getMessage ());
542+ return false ;
543+ } else {
544+ log .debug (
545+ "Failed to set analysis-id annotation (attempt {}): {}" ,
546+ attempt + 1 ,
547+ e .getMessage ());
548+ return false ;
549+ }
550+ } catch (Exception e ) {
551+ log .debug ("Unexpected error setting analysis-id annotation: {}" , e .getMessage ());
552+ return false ;
553+ }
554+ }
555+
434556 /**
435557 * Extracts the failure timestamp from a pod's container status.
436558 *
0 commit comments